
I recently posted this script:

VIM colors downloader in Python

But since I'm not allowed to update the code there, I wanted to get feedback on this version, which uses multiprocessing:

import os
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing import Process


class Spider:
    def __init__(self, total_pages=40):
        self.base_url = "http://vimcolors.com"
        self.total_pages = total_pages
        self.download_dir = 'colors'
        # If we don't have the download directory
        if not os.path.isdir(self.download_dir):
            print(self.download_dir, 'does not exist, trying to create it...')
            # create it...
            os.mkdir(self.download_dir)

    def download(self, name, url):
        try:
            full_path = os.path.join(self.download_dir, name)
            # If we have already downloaded this file, just skip
            if os.path.isfile(full_path):
                raise Exception('File: {} already exists; skipping.'.format(name))

            # Get the response
            response = requests.get(url)
            if response.status_code == 404:
                raise Exception('File not found: {}'.format(url))

            # Try downloading the file
            with open(full_path, 'wb') as file_path:
                file_path.write(response.content)
        except Exception as e:
            print(e)
        else:
            print('Downloaded', name)

    def crawl(self):
        def repo_formatter(scheme):
            # Build the raw file URL on raw.githubusercontent.com,
            # including the colorscheme's own file name
            base_url = scheme['github_repo']['address'].replace('github.com', 'raw.githubusercontent.com')
            return '{}/master/colors/{}.vim'.format(base_url, scheme['name'])

        processes = []
        # Loop over all the pages
        for page in range(self.total_pages):
            page_source = requests.get(self.base_url, params={'page': page + 1})
            plain_text = page_source.text
            soup = BeautifulSoup(plain_text, 'lxml')

            # Get the data
            json_data = json.loads(soup.find('div', {'id': 'data'}).attrs['data-colorschemes'])

            # Download the files
            for data in json_data['colorschemes']:
                file_name = data['name'] + '.vim'
                process = Process(target=self.download, args=(file_name, repo_formatter(data)))
                process.daemon = True
                process.start()
                processes.append(process)

        # Wait for every download to finish; daemon processes are
        # otherwise killed as soon as the main process exits
        for process in processes:
            process.join()

        print('\n\nAll done.')


# The guard is required by multiprocessing on platforms that use the
# "spawn" start method (e.g. Windows)
if __name__ == '__main__':
    colors_spider = Spider()
    colors_spider.crawl()

Everything is working. I just wanted to know: is multiprocessing a good choice in this case?
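
For comparison, here is a rough thread-based sketch of the same fan-out that I experimented with (the pool size of 8 is an arbitrary choice, and collect_jobs is just the crawl loop refactored to yield (file_name, url) pairs). Since each download spends almost all of its time waiting on the network, a bounded thread pool can reuse Spider.download unchanged:

import json
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup


def collect_jobs(spider):
    # Yield a (file_name, raw_url) pair for every colorscheme
    for page in range(spider.total_pages):
        page_source = requests.get(spider.base_url, params={'page': page + 1})
        soup = BeautifulSoup(page_source.text, 'lxml')
        json_data = json.loads(soup.find('div', {'id': 'data'}).attrs['data-colorschemes'])
        for data in json_data['colorschemes']:
            base_url = data['github_repo']['address'].replace('github.com', 'raw.githubusercontent.com')
            yield data['name'] + '.vim', '{}/master/colors/{}.vim'.format(base_url, data['name'])


if __name__ == '__main__':
    spider = Spider()
    # Threads share memory and are cheap to start; the with-block
    # waits for every submitted download before returning
    with ThreadPoolExecutor(max_workers=8) as executor:
        for file_name, url in collect_jobs(spider):
            executor.submit(spider.download, file_name, url)
    print('\n\nAll done.')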
