Revisions to Beautifulsoup scrape posts that has the word python in it with no python tag

edited tags

Link

edited Mar 22, 2022 at 15:05

Reinderien

71.2k
5
76
257

Bumped by Community user

occurred Dec 10, 2020 at 4:08

Tweeted twitter.com/StackCodeReview/status/1326132543490568192

occurred Nov 10, 2020 at 12:00

Rollback to Revision 2

Source Link

edited Nov 10, 2020 at 3:14

pacmaninbw ♦

26.2k
13
47
114

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

keywords = ['python']

for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3) # Sleep to avoid getting rate limited again
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary") # List of all question summaries in the current page
    for que in questions:
        found = False
        tagged = False
        q = que.select_one('.question-hyperlink').getText() # Store the title of the question
        for a in que.find_all('a', href=True):
            u = a['href'] # Store the link
            if u.split('/')[1] == 'questions' and u.split('/')[2] != 'tagged': # If this link is a question and not a tag
                res2 = requests.get("https://stackoverflow.com"+ucom" + u) # Send request for that question
                time.sleep(3) # Extra precaution to avoid getting rate limited again
                soup2 = BeautifulSoup(res2.text, "html.parser")
                body = str(soup2.select(".s-prose")) # This is the body of the question
                if any(key in body for key in keywords):
                    found = True
            if 'tagged/python' in u:
                tagged = True

        if found and not tagged:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that has the keyword "python" in its body, but no python tag. I want to know, did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

keywords = ['python']

for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary")
    for que in questions:
        found = False
        tagged = False
        q = que.select_one('.question-hyperlink').getText()
        for a in que.find_all('a', href=True):
            u = a['href']
            if u.split('/')[1] == 'questions' and u.split('/')[2] != 'tagged':
                res2 = requests.get("https://stackoverflow.com"+u)
                time.sleep(3)
                soup2 = BeautifulSoup(res2.text, "html.parser")
                body = str(soup2.select(".s-prose"))
                if any(key in body for key in keywords):
                    found = True
            if 'tagged/python' in u:
                tagged = True

        if found and not tagged:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that has the keyword "python" in its body, but no python tag. I want to know, did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

keywords = ['python']

for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3) # Sleep to avoid getting rate limited again
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary") # List of all question summaries in the current page
    for que in questions:
        found = False
        tagged = False
        q = que.select_one('.question-hyperlink').getText() # Store the title of the question
        for a in que.find_all('a', href=True):
            u = a['href'] # Store the link
            if u.split('/')[1] == 'questions' and u.split('/')[2] != 'tagged': # If this link is a question and not a tag
                res2 = requests.get("https://stackoverflow.com" + u) # Send request for that question
                time.sleep(3) # Extra precaution to avoid getting rate limited again
                soup2 = BeautifulSoup(res2.text, "html.parser")
                body = str(soup2.select(".s-prose")) # This is the body of the question
                if any(key in body for key in keywords):
                    found = True
            if 'tagged/python' in u:
                tagged = True

        if found and not tagged:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that has the keyword "python" in its body, but no python tag. I want to know, did I implement the algorithm optimally? Can you show me where to improve?

Rollback to Revision 1

Source Link

edited Nov 10, 2020 at 3:13

pacmaninbw ♦

26.2k
13
47
114

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

keywords = ['python']

for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3) # Sleep to avoid getting rate limited again
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary") # List of all question summaries in the current page
    for que in questions:
        found = False
        tagged = False
        q = que.select_one('.question-hyperlink').getText() # Store the title of the question
        for a in que.find_all('a', href=True):
            u = a['href'] # Store the link
            if u.split('/')[1] == 'questions' and u.split('/')[2] != 'tagged': # If this link is a question and not a tag
                res2 = requests.get("https://stackoverflow.com" + ucom"+u) # Send request for that question
                time.sleep(3) # Extra precaution to avoid getting rate limited again
                soup2 = BeautifulSoup(res2.text, "html.parser")
                body = str(soup2.select(".s-prose")) # This is the body of the question
                if any(key in body for key in keywords):
                    found = True
            if 'tagged/python' in u:
                tagged = True

        if found and not tagged:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that has the keyword "python" in its body, but no python tag. I want to know, did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

keywords = ['python']

for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3) # Sleep to avoid getting rate limited again
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary") # List of all question summaries in the current page
    for que in questions:
        found = False
        tagged = False
        q = que.select_one('.question-hyperlink').getText() # Store the title of the question
        for a in que.find_all('a', href=True):
            u = a['href'] # Store the link
            if u.split('/')[1] == 'questions' and u.split('/')[2] != 'tagged': # If this link is a question and not a tag
                res2 = requests.get("https://stackoverflow.com" + u) # Send request for that question
                time.sleep(3) # Extra precaution to avoid getting rate limited again
                soup2 = BeautifulSoup(res2.text, "html.parser")
                body = str(soup2.select(".s-prose")) # This is the body of the question
                if any(key in body for key in keywords):
                    found = True
            if 'tagged/python' in u:
                tagged = True

        if found and not tagged:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that has the keyword "python" in its body, but no python tag. I want to know, did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

keywords = ['python']

for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary")
    for que in questions:
        found = False
        tagged = False
        q = que.select_one('.question-hyperlink').getText()
        for a in que.find_all('a', href=True):
            u = a['href']
            if u.split('/')[1] == 'questions' and u.split('/')[2] != 'tagged':
                res2 = requests.get("https://stackoverflow.com"+u)
                time.sleep(3)
                soup2 = BeautifulSoup(res2.text, "html.parser")
                body = str(soup2.select(".s-prose"))
                if any(key in body for key in keywords):
                    found = True
            if 'tagged/python' in u:
                tagged = True

        if found and not tagged:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that has the keyword "python" in its body, but no python tag. I want to know, did I implement the algorithm optimally? Can you show me where to improve?

added 316 characters in body

Source Link

edited Nov 10, 2020 at 2:38

Chocolate

1k
5
21

Loading

Source Link

asked Nov 9, 2020 at 2:23

Chocolate

1k
5
21

Loading

Stack Exchange Network

Return to Question