Rollback to Revision 2

Source Link

edited Sep 2, 2019 at 8:57

51k
5
83
177

As for the code below, I'd appreciate any feedback, particularly in regards to threading, as I'm new to this.

#!/usr/bin/python3

import os
import sys
import time
import threading
from pathlib import Path
from shutil import copyfileobj


import requests
from lxml import html


BASE_URL = "https://www.xkcd.com/"
ARCHIVE = "https://www.xkcd.com/archive"
SAVE_DIRECTORY = Path('xkcd_comics')
LOGO = """
       _           _                      
 tiny | |  image  | | downloader for
 __  _| | _____ __| |  ___ ___  _ __ ___  
 \ \/ / |/ / __/ _` | / __/ _ \| '_ ` _ \ 
  >  <|   < (_| (_| || (_| (_) | | | | | |
 /_/\_\_|\_\___\__,_(_)___\___/|_| |_| |_|
 version 0.1
"""


def show_logo():
    print(LOGO)


def fetch_url(url: str) -> requests.Response:
    return requests.get(url)


def head_option(values: list) -> str:
    return next(iter(values), None)


def get_penultimate(url: str) -> int:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    newest_comic = head_option(
        tree.xpath('//*[@id="middleContainer"]/a[1]/@href'))
    return int(newest_comic.replace("/", ""))


def get_images_from_page(url: str) -> str:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    return head_option(tree.xpath('//*[@id="comic"]//img/@src'))


def get_number_of_pages(latest_comic: int) -> int:
    print(f"There are {latest_comic} comics.")
    print(f"How many do you want to download? Type 0 to exit.")
    while True:
        try:
            number_of_comics = int(input(">> "))
        except ValueError:
            print("Error: Expected a number. Try again.")
            continue
        if number_of_comics > latest_comic or number_of_comics < 0:
            print("Error: Incorrect number of comics. Try again.")
            continue
        elif number_of_comics == 0:
            sys.exit()
        return number_of_comics


def clip_url(img: str) -> str:
    return img.rpartition("/")[-1]


def make_dir():
    return os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def save_image(img: str):
    comic_name = clip_url(img)
    print(f"Downloading: {comic_name}")
    f_name = SAVE_DIRECTORY / comic_name
    with requests.get("https:" + img, stream=True) as img, open(f_name, "wb") \
            as output:
        copyfileobj(img.raw, output)


def show_time(seconds: int) -> int:
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    time_elapsed = f"{hours:02d}:{minutes:02d}:{seconds:02d}" 
    return time_elapsed


def get_xkcd():
    show_logo()
    make_dir()

    collect_garbage = []
    latest_comic = get_penultimate(ARCHIVE)
    pages = get_number_of_pages(latest_comic)

    start = time.time()
    for page in reversed(range(latest_comic - pages + 1, latest_comic + 1)):
        print(f"Fetching page {page} out of {latest_comic}")
        try:
            save_image(url = get_images_from_page(f"{BASE_URL}{page}/")
            thread = threading.Thread(target=save_image, args=(url, ))
            thread.start()
        except (ValueError, AttributeError, requests.exceptions.MissingSchema):
            print(f"WARNING: Invalid comic image source url.")
            collect_garbage.append(f"{BASE_URL}{page}")
            continue
    thread.join()
    end = time.time()

    print(f"Downloaded {pages} comic(s) in {show_time(int(end - start))}.")

    if len(collect_garbage) > 0:
        print("However, was unable to download images for these pages:")
        print("\n".join(page for page in collect_garbage))


def main():
    get_xkcd()


if __name__ == '__main__':
    main()

As for the code below, I'd appreciate any feedback.

#!/usr/bin/python3

import os
import sys
import time
import threading
from pathlib import Path
from shutil import copyfileobj


import requests
from lxml import html


BASE_URL = "https://www.xkcd.com/"
ARCHIVE = "https://www.xkcd.com/archive"
SAVE_DIRECTORY = Path('xkcd_comics')
LOGO = """
       _           _                      
 tiny | |  image  | | downloader for
 __  _| | _____ __| |  ___ ___  _ __ ___  
 \ \/ / |/ / __/ _` | / __/ _ \| '_ ` _ \ 
  >  <|   < (_| (_| || (_| (_) | | | | | |
 /_/\_\_|\_\___\__,_(_)___\___/|_| |_| |_|
 version 0.1
"""


def show_logo():
    print(LOGO)


def fetch_url(url: str) -> requests.Response:
    return requests.get(url)


def head_option(values: list) -> str:
    return next(iter(values), None)


def get_penultimate(url: str) -> int:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    newest_comic = head_option(
        tree.xpath('//*[@id="middleContainer"]/a[1]/@href'))
    return int(newest_comic.replace("/", ""))


def get_images_from_page(url: str) -> str:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    return head_option(tree.xpath('//*[@id="comic"]//img/@src'))


def get_number_of_pages(latest_comic: int) -> int:
    print(f"There are {latest_comic} comics.")
    print(f"How many do you want to download? Type 0 to exit.")
    while True:
        try:
            number_of_comics = int(input(">> "))
        except ValueError:
            print("Error: Expected a number. Try again.")
            continue
        if number_of_comics > latest_comic or number_of_comics < 0:
            print("Error: Incorrect number of comics. Try again.")
            continue
        elif number_of_comics == 0:
            sys.exit()
        return number_of_comics


def clip_url(img: str) -> str:
    return img.rpartition("/")[-1]


def make_dir():
    return os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def save_image(img: str):
    comic_name = clip_url(img)
    print(f"Downloading: {comic_name}")
    f_name = SAVE_DIRECTORY / comic_name
    with requests.get("https:" + img, stream=True) as img, open(f_name, "wb") \
            as output:
        copyfileobj(img.raw, output)


def show_time(seconds: int) -> int:
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    time_elapsed = f"{hours:02d}:{minutes:02d}:{seconds:02d}" 
    return time_elapsed


def get_xkcd():
    show_logo()
    make_dir()

    collect_garbage = []
    latest_comic = get_penultimate(ARCHIVE)
    pages = get_number_of_pages(latest_comic)

    start = time.time()
    for page in reversed(range(latest_comic - pages + 1, latest_comic + 1)):
        print(f"Fetching page {page} out of {latest_comic}")
        try:
            save_image(get_images_from_page(f"{BASE_URL}{page}/"))
        except (ValueError, AttributeError, requests.exceptions.MissingSchema):
            print(f"WARNING: Invalid comic image source url.")
            collect_garbage.append(f"{BASE_URL}{page}")
            continue
    end = time.time()

    print(f"Downloaded {pages} comic(s) in {show_time(int(end - start))}.")

    if len(collect_garbage) > 0:
        print("However, was unable to download images for these pages:")
        print("\n".join(page for page in collect_garbage))


def main():
    get_xkcd()


if __name__ == '__main__':
    main()

As for the code below, I'd appreciate any feedback, particularly in regards to threading, as I'm new to this.

#!/usr/bin/python3

import os
import sys
import time
import threading
from pathlib import Path
from shutil import copyfileobj


import requests
from lxml import html


BASE_URL = "https://www.xkcd.com/"
ARCHIVE = "https://www.xkcd.com/archive"
SAVE_DIRECTORY = Path('xkcd_comics')
LOGO = """
       _           _                      
 tiny | |  image  | | downloader for
 __  _| | _____ __| |  ___ ___  _ __ ___  
 \ \/ / |/ / __/ _` | / __/ _ \| '_ ` _ \ 
  >  <|   < (_| (_| || (_| (_) | | | | | |
 /_/\_\_|\_\___\__,_(_)___\___/|_| |_| |_|
 version 0.1
"""


def show_logo():
    print(LOGO)


def fetch_url(url: str) -> requests.Response:
    return requests.get(url)


def head_option(values: list) -> str:
    return next(iter(values), None)


def get_penultimate(url: str) -> int:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    newest_comic = head_option(
        tree.xpath('//*[@id="middleContainer"]/a[1]/@href'))
    return int(newest_comic.replace("/", ""))


def get_images_from_page(url: str) -> str:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    return head_option(tree.xpath('//*[@id="comic"]//img/@src'))


def get_number_of_pages(latest_comic: int) -> int:
    print(f"There are {latest_comic} comics.")
    print(f"How many do you want to download? Type 0 to exit.")
    while True:
        try:
            number_of_comics = int(input(">> "))
        except ValueError:
            print("Error: Expected a number. Try again.")
            continue
        if number_of_comics > latest_comic or number_of_comics < 0:
            print("Error: Incorrect number of comics. Try again.")
            continue
        elif number_of_comics == 0:
            sys.exit()
        return number_of_comics


def clip_url(img: str) -> str:
    return img.rpartition("/")[-1]


def make_dir():
    return os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def save_image(img: str):
    comic_name = clip_url(img)
    print(f"Downloading: {comic_name}")
    f_name = SAVE_DIRECTORY / comic_name
    with requests.get("https:" + img, stream=True) as img, open(f_name, "wb") \
            as output:
        copyfileobj(img.raw, output)


def show_time(seconds: int) -> int:
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    time_elapsed = f"{hours:02d}:{minutes:02d}:{seconds:02d}" 
    return time_elapsed


def get_xkcd():
    show_logo()
    make_dir()

    collect_garbage = []
    latest_comic = get_penultimate(ARCHIVE)
    pages = get_number_of_pages(latest_comic)

    start = time.time()
    for page in reversed(range(latest_comic - pages + 1, latest_comic + 1)):
        print(f"Fetching page {page} out of {latest_comic}")
        try:
            url = get_images_from_page(f"{BASE_URL}{page}/")
            thread = threading.Thread(target=save_image, args=(url, ))
            thread.start()
        except (ValueError, AttributeError, requests.exceptions.MissingSchema):
            print(f"WARNING: Invalid comic image source url.")
            collect_garbage.append(f"{BASE_URL}{page}")
            continue
    thread.join()
    end = time.time()

    print(f"Downloaded {pages} comic(s) in {show_time(int(end - start))}.")

    if len(collect_garbage) > 0:
        print("However, was unable to download images for these pages:")
        print("\n".join(page for page in collect_garbage))


def main():
    get_xkcd()


if __name__ == '__main__':
    main()

Removed threading from code to aviod DoS/DDoS.

Source Link

edited Sep 2, 2019 at 8:53

baduker

1.4k
15
30

As for the code below, I'd appreciate any feedback, particularly in regards to threading, as I'm new to this.

#!/usr/bin/python3

import os
import sys
import time
import threading
from pathlib import Path
from shutil import copyfileobj


import requests
from lxml import html


BASE_URL = "https://www.xkcd.com/"
ARCHIVE = "https://www.xkcd.com/archive"
SAVE_DIRECTORY = Path('xkcd_comics')
LOGO = """
       _           _                      
 tiny | |  image  | | downloader for
 __  _| | _____ __| |  ___ ___  _ __ ___  
 \ \/ / |/ / __/ _` | / __/ _ \| '_ ` _ \ 
  >  <|   < (_| (_| || (_| (_) | | | | | |
 /_/\_\_|\_\___\__,_(_)___\___/|_| |_| |_|
 version 0.1
"""


def show_logo():
    print(LOGO)


def fetch_url(url: str) -> requests.Response:
    return requests.get(url)


def head_option(values: list) -> str:
    return next(iter(values), None)


def get_penultimate(url: str) -> int:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    newest_comic = head_option(
        tree.xpath('//*[@id="middleContainer"]/a[1]/@href'))
    return int(newest_comic.replace("/", ""))


def get_images_from_page(url: str) -> str:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    return head_option(tree.xpath('//*[@id="comic"]//img/@src'))


def get_number_of_pages(latest_comic: int) -> int:
    print(f"There are {latest_comic} comics.")
    print(f"How many do you want to download? Type 0 to exit.")
    while True:
        try:
            number_of_comics = int(input(">> "))
        except ValueError:
            print("Error: Expected a number. Try again.")
            continue
        if number_of_comics > latest_comic or number_of_comics < 0:
            print("Error: Incorrect number of comics. Try again.")
            continue
        elif number_of_comics == 0:
            sys.exit()
        return number_of_comics


def clip_url(img: str) -> str:
    return img.rpartition("/")[-1]


def make_dir():
    return os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def save_image(img: str):
    comic_name = clip_url(img)
    print(f"Downloading: {comic_name}")
    f_name = SAVE_DIRECTORY / comic_name
    with requests.get("https:" + img, stream=True) as img, open(f_name, "wb") \
            as output:
        copyfileobj(img.raw, output)


def show_time(seconds: int) -> int:
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    time_elapsed = f"{hours:02d}:{minutes:02d}:{seconds:02d}" 
    return time_elapsed


def get_xkcd():
    show_logo()
    make_dir()

    collect_garbage = []
    latest_comic = get_penultimate(ARCHIVE)
    pages = get_number_of_pages(latest_comic)

    start = time.time()
    for page in reversed(range(latest_comic - pages + 1, latest_comic + 1)):
        print(f"Fetching page {page} out of {latest_comic}")
        try:
            url = save_image(get_images_from_page(f"{BASE_URL}{page}/")
            thread = threading.Thread(target=save_image, args=(url, ))
            thread.start()
        except (ValueError, AttributeError, requests.exceptions.MissingSchema):
            print(f"WARNING: Invalid comic image source url.")
            collect_garbage.append(f"{BASE_URL}{page}")
            continue
    thread.join()
    end = time.time()

    print(f"Downloaded {pages} comic(s) in {show_time(int(end - start))}.")

    if len(collect_garbage) > 0:
        print("However, was unable to download images for these pages:")
        print("\n".join(page for page in collect_garbage))


def main():
    get_xkcd()


if __name__ == '__main__':
    main()

As for the code below, I'd appreciate any feedback, particularly in regards to threading, as I'm new to this.

#!/usr/bin/python3

import os
import sys
import time
import threading
from pathlib import Path
from shutil import copyfileobj


import requests
from lxml import html


BASE_URL = "https://www.xkcd.com/"
ARCHIVE = "https://www.xkcd.com/archive"
SAVE_DIRECTORY = Path('xkcd_comics')
LOGO = """
       _           _                      
 tiny | |  image  | | downloader for
 __  _| | _____ __| |  ___ ___  _ __ ___  
 \ \/ / |/ / __/ _` | / __/ _ \| '_ ` _ \ 
  >  <|   < (_| (_| || (_| (_) | | | | | |
 /_/\_\_|\_\___\__,_(_)___\___/|_| |_| |_|
 version 0.1
"""


def show_logo():
    print(LOGO)


def fetch_url(url: str) -> requests.Response:
    return requests.get(url)


def head_option(values: list) -> str:
    return next(iter(values), None)


def get_penultimate(url: str) -> int:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    newest_comic = head_option(
        tree.xpath('//*[@id="middleContainer"]/a[1]/@href'))
    return int(newest_comic.replace("/", ""))


def get_images_from_page(url: str) -> str:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    return head_option(tree.xpath('//*[@id="comic"]//img/@src'))


def get_number_of_pages(latest_comic: int) -> int:
    print(f"There are {latest_comic} comics.")
    print(f"How many do you want to download? Type 0 to exit.")
    while True:
        try:
            number_of_comics = int(input(">> "))
        except ValueError:
            print("Error: Expected a number. Try again.")
            continue
        if number_of_comics > latest_comic or number_of_comics < 0:
            print("Error: Incorrect number of comics. Try again.")
            continue
        elif number_of_comics == 0:
            sys.exit()
        return number_of_comics


def clip_url(img: str) -> str:
    return img.rpartition("/")[-1]


def make_dir():
    return os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def save_image(img: str):
    comic_name = clip_url(img)
    print(f"Downloading: {comic_name}")
    f_name = SAVE_DIRECTORY / comic_name
    with requests.get("https:" + img, stream=True) as img, open(f_name, "wb") \
            as output:
        copyfileobj(img.raw, output)


def show_time(seconds: int) -> int:
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    time_elapsed = f"{hours:02d}:{minutes:02d}:{seconds:02d}" 
    return time_elapsed


def get_xkcd():
    show_logo()
    make_dir()

    collect_garbage = []
    latest_comic = get_penultimate(ARCHIVE)
    pages = get_number_of_pages(latest_comic)

    start = time.time()
    for page in reversed(range(latest_comic - pages + 1, latest_comic + 1)):
        print(f"Fetching page {page} out of {latest_comic}")
        try:
            url = get_images_from_page(f"{BASE_URL}{page}/")
            thread = threading.Thread(target=save_image, args=(url, ))
            thread.start()
        except (ValueError, AttributeError, requests.exceptions.MissingSchema):
            print(f"WARNING: Invalid comic image source url.")
            collect_garbage.append(f"{BASE_URL}{page}")
            continue
    thread.join()
    end = time.time()

    print(f"Downloaded {pages} comic(s) in {show_time(int(end - start))}.")

    if len(collect_garbage) > 0:
        print("However, was unable to download images for these pages:")
        print("\n".join(page for page in collect_garbage))


def main():
    get_xkcd()


if __name__ == '__main__':
    main()

As for the code below, I'd appreciate any feedback.

#!/usr/bin/python3

import os
import sys
import time
import threading
from pathlib import Path
from shutil import copyfileobj


import requests
from lxml import html


BASE_URL = "https://www.xkcd.com/"
ARCHIVE = "https://www.xkcd.com/archive"
SAVE_DIRECTORY = Path('xkcd_comics')
LOGO = """
       _           _                      
 tiny | |  image  | | downloader for
 __  _| | _____ __| |  ___ ___  _ __ ___  
 \ \/ / |/ / __/ _` | / __/ _ \| '_ ` _ \ 
  >  <|   < (_| (_| || (_| (_) | | | | | |
 /_/\_\_|\_\___\__,_(_)___\___/|_| |_| |_|
 version 0.1
"""


def show_logo():
    print(LOGO)


def fetch_url(url: str) -> requests.Response:
    return requests.get(url)


def head_option(values: list) -> str:
    return next(iter(values), None)


def get_penultimate(url: str) -> int:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    newest_comic = head_option(
        tree.xpath('//*[@id="middleContainer"]/a[1]/@href'))
    return int(newest_comic.replace("/", ""))


def get_images_from_page(url: str) -> str:
    page = fetch_url(url)
    tree = html.fromstring(page.content)
    return head_option(tree.xpath('//*[@id="comic"]//img/@src'))


def get_number_of_pages(latest_comic: int) -> int:
    print(f"There are {latest_comic} comics.")
    print(f"How many do you want to download? Type 0 to exit.")
    while True:
        try:
            number_of_comics = int(input(">> "))
        except ValueError:
            print("Error: Expected a number. Try again.")
            continue
        if number_of_comics > latest_comic or number_of_comics < 0:
            print("Error: Incorrect number of comics. Try again.")
            continue
        elif number_of_comics == 0:
            sys.exit()
        return number_of_comics


def clip_url(img: str) -> str:
    return img.rpartition("/")[-1]


def make_dir():
    return os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def save_image(img: str):
    comic_name = clip_url(img)
    print(f"Downloading: {comic_name}")
    f_name = SAVE_DIRECTORY / comic_name
    with requests.get("https:" + img, stream=True) as img, open(f_name, "wb") \
            as output:
        copyfileobj(img.raw, output)


def show_time(seconds: int) -> int:
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    time_elapsed = f"{hours:02d}:{minutes:02d}:{seconds:02d}" 
    return time_elapsed


def get_xkcd():
    show_logo()
    make_dir()

    collect_garbage = []
    latest_comic = get_penultimate(ARCHIVE)
    pages = get_number_of_pages(latest_comic)

    start = time.time()
    for page in reversed(range(latest_comic - pages + 1, latest_comic + 1)):
        print(f"Fetching page {page} out of {latest_comic}")
        try:
            save_image(get_images_from_page(f"{BASE_URL}{page}/"))
        except (ValueError, AttributeError, requests.exceptions.MissingSchema):
            print(f"WARNING: Invalid comic image source url.")
            collect_garbage.append(f"{BASE_URL}{page}")
            continue
    end = time.time()

    print(f"Downloaded {pages} comic(s) in {show_time(int(end - start))}.")

    if len(collect_garbage) > 0:
        print("However, was unable to download images for these pages:")
        print("\n".join(page for page in collect_garbage))


def main():
    get_xkcd()


if __name__ == '__main__':
    main()

Tweeted twitter.com/StackCodeReview/status/1168403310396805121

occurred Sep 2, 2019 at 6:00

Became Hot Network Question

occurred Sep 2, 2019 at 5:18

Modified title

Link

edited Sep 1, 2019 at 18:48

baduker

1.4k
15
30

Tiny image scraper for xkcd.com

Source Link

asked Sep 1, 2019 at 18:42

baduker

1.4k
15
30

Loading

Stack Exchange Network

Return to Question

Tiny image scraper for xkcd.com