Skip to main content
edited tags
Link
Reinderien
  • 71.2k
  • 5
  • 76
  • 257
Bumped by Community user
Tweeted twitter.com/StackCodeReview/status/1326132543490568192
Rollback to Revision 2
Source Link
pacmaninbw
  • 26.2k
  • 13
  • 47
  • 114

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

# Scan the newest-questions listing and print the title of every question
# whose page mentions one of `keywords` but whose summary has no python tag.

# Substrings searched for (case-sensitively) in the fetched question page.
keywords = ['python']

# NOTE(review): paging starts at page=0 here; confirm whether the site
# numbers its listing pages from 1.
for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)  # Sleep to avoid getting rate limited again
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary")  # All question summaries on the current page
    for que in questions:
        q = que.select_one('.question-hyperlink').getText()  # Title of the question
        links = [a['href'] for a in que.find_all('a', href=True)]

        # Look at the tags first: a question that already has a python tag is
        # never printed, so fetching its page (and sleeping) would be wasted.
        # This is a substring test, so tags such as python-3.x count as well.
        if any('tagged/python' in u for u in links):
            continue

        found = False
        for u in links:
            parts = u.split('/')
            # Follow only links shaped like /questions/<not "tagged">/...
            # The length check keeps short hrefs (e.g. "#") from raising
            # IndexError.
            if len(parts) < 3 or parts[1] != 'questions' or parts[2] == 'tagged':
                continue
            res2 = requests.get("https://stackoverflow.com" + u)  # Request the question page
            time.sleep(3)  # Extra precaution to avoid getting rate limited again
            soup2 = BeautifulSoup(res2.text, "html.parser")
            # Stringified list of every ".s-prose" element on the page.
            body = str(soup2.select(".s-prose"))
            if any(key in body for key in keywords):
                found = True
                break  # One match is enough; skip any remaining links

        if found:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that have the keyword "python" in their body but no python tag. I want to know: did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

# Substrings that mark a fetched question page as relevant.
keywords = ['python']

# Walk the newest-questions listing page by page and print the titles of
# questions that mention a keyword but carry no python tag link.
for n in range(1000):
    listing = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)  # pause between requests
    listing_soup = BeautifulSoup(listing.text, "html.parser")
    for summary in listing_soup.select(".question-summary"):
        mentions_keyword = False
        has_python_tag = False
        title = summary.select_one('.question-hyperlink').getText()
        for anchor in summary.find_all('a', href=True):
            href = anchor['href']
            segments = href.split('/')
            # A question link is /questions/<anything except "tagged">/...
            if segments[1] == 'questions' and segments[2] != 'tagged':
                question_page = requests.get("https://stackoverflow.com" + href)
                time.sleep(3)  # pause between requests
                question_soup = BeautifulSoup(question_page.text, "html.parser")
                prose = str(question_soup.select(".s-prose"))
                for keyword in keywords:
                    if keyword in prose:
                        mentions_keyword = True
                        break
            if 'tagged/python' in href:
                has_python_tag = True

        if mentions_keyword and not has_python_tag:
            print(title)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that have the keyword "python" in their body but no python tag. I want to know: did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

# Keywords to search for in each fetched question page.
keywords = ['python']

# For every listing page: visit each question linked from a summary and
# print its title when a keyword is present and no python tag link is.
for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)  # Throttle to avoid being rate limited
    summaries = BeautifulSoup(res.text, "html.parser").select(".question-summary")
    for summary in summaries:
        found = tagged = False
        title = summary.select_one('.question-hyperlink').getText()
        for link in summary.find_all('a', href=True):
            target = link['href']
            pieces = target.split('/')
            is_question_link = pieces[1] == 'questions' and pieces[2] != 'tagged'
            if is_question_link:
                # Download the question page itself
                detail = requests.get("https://stackoverflow.com" + target)
                time.sleep(3)  # Throttle to avoid being rate limited
                detail_soup = BeautifulSoup(detail.text, "html.parser")
                text = str(detail_soup.select(".s-prose"))
                if any(word in text for word in keywords):
                    found = True
            tagged = tagged or 'tagged/python' in target

        if found and not tagged:
            print(title)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that have the keyword "python" in their body but no python tag. I want to know: did I implement the algorithm optimally? Can you show me where to improve?

Rollback to Revision 1
Source Link
pacmaninbw
  • 26.2k
  • 13
  • 47
  • 114

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

# Scan the newest-questions listing and print the title of every question
# whose page mentions one of `keywords` but whose summary has no python tag.

# Substrings searched for (case-sensitively) in the fetched question page.
keywords = ['python']

# NOTE(review): paging starts at page=0 here; confirm whether the site
# numbers its listing pages from 1.
for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)  # Sleep to avoid getting rate limited again
    soup = BeautifulSoup(res.text, "html.parser")
    questions = soup.select(".question-summary")  # All question summaries on the current page
    for que in questions:
        q = que.select_one('.question-hyperlink').getText()  # Title of the question
        hrefs = [a['href'] for a in que.find_all('a', href=True)]

        # Decide on the tags before touching the network: a summary that
        # links to a python tag is never printed, so its page need not be
        # downloaded. Substring test, so e.g. python-3.x also counts.
        tagged = any('tagged/python' in u for u in hrefs)
        if tagged:
            continue

        found = False
        for u in hrefs:
            parts = u.split('/')
            # Follow only links shaped like /questions/<not "tagged">/...
            # Guarding the length avoids IndexError on short hrefs.
            if len(parts) < 3 or parts[1] != 'questions' or parts[2] == 'tagged':
                continue
            res2 = requests.get("https://stackoverflow.com" + u)  # Request the question page
            time.sleep(3)  # Extra precaution to avoid getting rate limited again
            soup2 = BeautifulSoup(res2.text, "html.parser")
            # Stringified list of every ".s-prose" element on the page.
            body = str(soup2.select(".s-prose"))
            if any(key in body for key in keywords):
                found = True
                break  # A single match settles it; no need to fetch more

        if found:
            print(q)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that have the keyword "python" in their body but no python tag. I want to know: did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

# Keywords to search for in each fetched question page.
keywords = ['python']

# For every listing page: visit each question linked from a summary and
# print its title when a keyword is present and no python tag link is.
for n in range(1000):
    res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)  # Throttle to avoid being rate limited
    summaries = BeautifulSoup(res.text, "html.parser").select(".question-summary")
    for summary in summaries:
        found = tagged = False
        title = summary.select_one('.question-hyperlink').getText()
        for link in summary.find_all('a', href=True):
            target = link['href']
            pieces = target.split('/')
            is_question_link = pieces[1] == 'questions' and pieces[2] != 'tagged'
            if is_question_link:
                # Download the question page itself
                detail = requests.get("https://stackoverflow.com" + target)
                time.sleep(3)  # Throttle to avoid being rate limited
                detail_soup = BeautifulSoup(detail.text, "html.parser")
                text = str(detail_soup.select(".s-prose"))
                if any(word in text for word in keywords):
                    found = True
            tagged = tagged or 'tagged/python' in target

        if found and not tagged:
            print(title)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that have the keyword "python" in their body but no python tag. I want to know: did I implement the algorithm optimally? Can you show me where to improve?

This is my first time web scraping, and here is the code I whipped up:

from bs4 import BeautifulSoup
import requests
import time

# Substrings that mark a fetched question page as relevant.
keywords = ['python']

# Walk the newest-questions listing page by page and print the titles of
# questions that mention a keyword but carry no python tag link.
for n in range(1000):
    listing = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
    time.sleep(3)  # pause between requests
    listing_soup = BeautifulSoup(listing.text, "html.parser")
    for summary in listing_soup.select(".question-summary"):
        mentions_keyword = False
        has_python_tag = False
        title = summary.select_one('.question-hyperlink').getText()
        for anchor in summary.find_all('a', href=True):
            href = anchor['href']
            segments = href.split('/')
            # A question link is /questions/<anything except "tagged">/...
            if segments[1] == 'questions' and segments[2] != 'tagged':
                question_page = requests.get("https://stackoverflow.com" + href)
                time.sleep(3)  # pause between requests
                question_soup = BeautifulSoup(question_page.text, "html.parser")
                prose = str(question_soup.select(".s-prose"))
                for keyword in keywords:
                    if keyword in prose:
                        mentions_keyword = True
                        break
            if 'tagged/python' in href:
                has_python_tag = True

        if mentions_keyword and not has_python_tag:
            print(title)

My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that have the keyword "python" in their body but no python tag. I want to know: did I implement the algorithm optimally? Can you show me where to improve?

added 316 characters in body
Source Link
Chocolate
  • 1k
  • 5
  • 21
Loading
Source Link
Chocolate
  • 1k
  • 5
  • 21
Loading