Skip to main content
Rollback to Revision 7
Source Link
Mast
  • 13.9k
  • 12
  • 57
  • 128
import re
import requests
import sys
import time
import validators
from collections import deque
from datetime import datetime, timedelta
from math import inf
from mmap import mmap
from pathlib import Path
from ping3 import ping
from reprint import output
from threading import Thread

def timestring(sec):
    """Convert a duration in seconds to a zero-padded 'HH:MM:SS' string.

    Args:
        sec: Duration in seconds (int or float); truncated toward zero.

    Returns:
        The duration formatted as 'HH:MM:SS'.
    """
    sec = int(sec)
    # divmod gives quotient and remainder in one step:
    # total seconds -> (minutes, seconds) -> (hours, minutes).
    m, s = divmod(sec, 60)
    h, m = divmod(m, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

class Downloader:
    """Multi-threaded HTTP downloader with a live terminal progress display.

    When the server reports a content length and honours `range` requests,
    the file is split into `num_connections` byte ranges downloaded
    concurrently into a memory-mapped output file; otherwise it falls back
    to a sequential single-connection download.
    """

    def __init__(self):
        # Memory map over the output file; populated only on the
        # multi-threaded code path inside download().
        self.mm = None
        # Total bytes received so far; incremented by all worker threads.
        self.count = 0
        # Rolling window of recent byte totals sampled by the progress
        # loop; used to estimate the instantaneous download speed.
        self.recent = deque([0] * 12, maxlen=12)

    def multidown(self, url, start, end):
        """Download bytes [start, end) of `url` into the shared mmap."""
        # The HTTP `range` header is inclusive at both ends, hence end - 1.
        r = requests.get(url, headers={'range': 'bytes={0}-{1}'.format(start, end-1)}, stream=True)
        i = start
        # Stream in 1 MiB chunks, writing each at its absolute file offset.
        for chunk in r.iter_content(1048576):
            if chunk:
                self.mm[i: i+len(chunk)] = chunk
                self.count += len(chunk)
                i += len(chunk)

    def singledown(self, url, path):
        """Download `url` sequentially into `path` over one connection."""
        with requests.get(url, stream=True) as r:
            with path.open('wb') as file:
                # Keep-alive chunks are empty; skip them.
                for chunk in r.iter_content(1048576):
                    if chunk:
                        self.count += len(chunk)
                        file.write(chunk)

    def download(self, url, filepath, num_connections=32, overwrite=False):
        """Validate the destination and url, probe the server, then download.

        Args:
            url: Direct http(s) link to the target file.
            filepath: Absolute Windows-style destination path.
            num_connections: Parallel range requests to use when supported.
            overwrite: Overwrite an existing file without prompting.
        """
        singlethread = False
        threads = []
        bcontinue = False
        # Normalise to forward slashes so the Windows-path regex applies.
        filepath = filepath.replace('\\', '/')
        # Require a well-formed absolute Windows path on an existing drive.
        if (not re.match('^[a-zA-Z]:/(((?![<>:"/|?*]).)+((?<![ .])/)?)*$', filepath) or 
            not Path(filepath[:3]).exists()):
            print('Invalid windows file path has been inputted, process will now stop.')
            return
        if not validators.url(url):
            print('Invalid url has been inputted, process will now stop.')
            return
        if url.lower().startswith('ftp://'):
            print("`requests` module doesn't support File Transfer Protocol, process will now stop")
            return
        path = Path(filepath)
        if not path.exists():
            bcontinue = True
        else:
            if path.is_file():
                if overwrite:
                    bcontinue = True
                else:
                    # Interactive confirmation loop; repeats on invalid input.
                    while True:
                        answer = input(f'`{filepath}` already exists, do you want to overwrite it? \n(Yes, No):').lower()
                        if answer in ['y', 'yes', 'n', 'no']:
                            if answer.startswith('y'):
                                bcontinue = True
                            break
                        else:
                            print('Invalid input detected, retaking input.')
        if not bcontinue:
            print(f'Overwritting {filepath} has been aborted, process will now stop.')
            return
        bcontinue = False
        # Hostname portion of the url ('scheme://host/...' -> index 2).
        server = url.split('/')[2]
        ok = ping(server, timeout=2)
        # NOTE(review): per ping3's convention, False means unknown host and
        # None means timeout, so `== False` deliberately excludes None here.
        if ok == False:
            print('The server of the inputted url is non-existent, process will now stop.')
            return
        if ok:
            bcontinue = True
        if not ok:
            # Timed out: retry the ping up to 5 times before giving up.
            print('Connection has timed out, will reattempt to ping server 5 times.')
            for i in range(5):
                print(f'Reattempting to ping server, retrying {i + 1} out of 5')
                ok = ping(server, timeout=2)
                if ok:
                    print(f'Connection successful on retry {i + 1}, process will now continue.')
                    bcontinue = True
                    break
                else:
                    print(f'Retry {i + 1} out of 5 timed out' + (i != 4) * ', reattempting in 1 second.' + (i == 4) * '.')
                    time.sleep(1)
        if not bcontinue:
            print('Failed to connect server, connection timed out, process will now stop')
            return
        bcontinue = False
        # HEAD request to obtain headers (content-length) without the body;
        # retried up to 5 times on a non-200 status.
        head = requests.head(url)
        if head.status_code == 200:
            bcontinue = True
        else:
            for i in range(5):
                print(f'Server responce is invalid, retrying {i + 1} out of 5')
                head = requests.head(url)
                if head.status_code == 200:
                    print(f'Connection successful on retry {i + 1}, process will now continue.')
                    bcontinue = True
                    break
                else:
                    print(f'Retry {i + 1} out of 5 failed to access data' + (i != 4) * ', reattempting in 1 second.' + (i == 4) * '.')
                    time.sleep(1)
        if not bcontinue:
            print("Can't establish a connection with access to data, can't download target file, process will now stop.")
            return
        # Ensure the destination directory exists.
        folder = '/'.join(filepath.split('/')[:-1])
        Path(folder).mkdir(parents=True, exist_ok=True)
        headers = head.headers
        total = headers.get('content-length')
        if not total:
            # Unknown size: single-threaded download, progress shown against inf.
            print(f'Cannot find the total length of the content of {url}, the file will be downloaded using a single thread.')
            started = datetime.now()
            print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
            th = Thread(target=self.singledown, args=(url, path))
            threads.append(th)
            th.start()
            total = inf
            singlethread = True
        else:
            total = int(total)
            # Probe range support: a compliant server answers 206 Partial Content.
            code = requests.head(url, headers={'range':'bytes=0-100'}).status_code
            if code != 206:
                print('Server does not support the `range` parameter, the file will be downloaded using a single thread.')
                started = datetime.now()
                print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
                th = Thread(target=self.singledown, args=(url, path))
                threads.append(th)
                th.start()
                singlethread = True
            else:
                # Pre-size the file, then memory-map it so worker threads can
                # write their byte ranges directly at the right offsets.
                path.touch()
                file = path.open(mode='wb')
                file.seek(total - 1)
                file.write(b'\0')
                file.close()
                file = path.open(mode='r+b')
                self.mm = mmap(file.fileno(), 0)
                segment = total / num_connections
                started = datetime.now()
                print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
                for i in range(num_connections):
                    th = Thread(target=self.multidown, args=(url, int(segment * i), int(segment * (i + 1))))
                    threads.append(th)
                    th.start()
        downloaded = 0
        totalMiB = total / 1048576
        speeds = []
        # Sampling period of the progress loop; the per-sample byte deltas
        # below are divided by this to get MiB/s.
        interval = 0.04
        with output(initial_len=4, interval=0) as dynamic_print:
            while True:
                # Number of worker threads still running.
                status = sum([i.is_alive() for i in threads])
                downloaded = self.count
                self.recent.append(downloaded)
                done = int(100 * downloaded / total)
                doneMiB = downloaded / 1048576
                gt0 = len([i for i in self.recent if i])
                if not gt0:
                    speed = 0
                else:
                    # The deque is always full (pre-filled with zeros), so the
                    # non-zero samples occupy the last gt0 slots.
                    recent = list(self.recent)[12 - gt0:]
                    if len(recent) == 1:
                        speed = recent[0] / 1048576 / interval
                    else:
                        diff = [b - a for a, b in zip(recent, recent[1:])]
                        speed = sum(diff) / len(diff) / 1048576 / interval
                speeds.append(speed)
                nzspeeds = [i for i in speeds if i]
                if nzspeeds:
                    minspeed = min(nzspeeds)
                else:
                    minspeed = 0
                maxspeed = max(speeds)
                now = datetime.now()
                elapsed = (now - started).total_seconds()
                meanspeed = downloaded / elapsed / 1048576
                remaining = totalMiB - doneMiB
                dynamic_print[0] = '[{0}{1}] {2}'.format(
                    '\u2588' * done, '\u00b7' * (100-done), str(done)) + '% completed'
                dynamic_print[1] = '{0:.2f} MiB downloaded, {1:.2f} MiB total, {2:.2f} MiB remaining, download speed: {3:.2f} MiB/s'.format(
                    doneMiB, totalMiB, remaining, speed)
                # ETA from the instantaneous speed; shown as a sentinel value
                # when stalled or when the total size is unknown.
                if speed and total != inf:
                    eta = timestring(remaining / speed)
                else:
                    eta = '99:59:59'
                dynamic_print[2] = 'Minimum speed: {0:.2f} MiB/s, average speed: {1:.2f} MiB/s, maximum speed: {2:.2f} MiB/s'.format(minspeed, meanspeed, maxspeed)
                dynamic_print[3] = 'Task started on {0}, {1} elapsed, ETA: {2}'.format(
                    started.strftime('%Y-%m-%d %H:%M:%S'), timestring(elapsed), eta)
                if status == 0:
                    ended = datetime.now()
                    if not singlethread:
                        self.mm.close()
                    break
                time.sleep(interval)
        time_spent = (ended - started).total_seconds()
        # NOTE(review): when the content length was unknown, total is inf and
        # this average is inf too; self.count would be a better numerator.
        meanspeed = total / time_spent / 1048576
        print('Task completed on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
            ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))

if __name__ == '__main__':
    # CLI entry point: forwards url, filepath and the optional
    # num_connections / overwrite arguments to Downloader.download().
    d = Downloader()
    d.download(*sys.argv[1:])

Final update:


Update

This update didn't do much change to the logic of the code, the actual download methods remain unchanged, I just added a few new checks to validate the url and find whether the server exists or not, check connectivity with the server, and stop execution if the process cannot proceed. And I have re-implemented the average speed and eta to make them more logical, and used .total_seconds() method instead of .seconds in calculations.

The changes are small and the existing answer doesn't actually improve the performance of the code itself, so please don't revert this edit.

Currently this script doesn't support FTP because requests doesn't support FTP, but with small tweaks, using urllib2 to download over FTP is trivial, but I really can't figure out how can I pause and resume downloads, and send a signal to stop the download.

In my testing I have encountered this:

[████████████████████████████████████████████████████████████████████████████████████████████········] 92% completed
1374.16 MiB downloaded, 1489.83 MiB total, 115.67 MiB remaining, download speed: 0.00 MiB/s
Minimum speed: 1.27 MiB/s, average speed: 2.88 MiB/s, maximum speed: 54.55 MiB/s
Task started on 2021-08-08 17:25:05, 00:07:57 elapsed, ETA: 99:59:59

I am in China and I never ever surf Chinese net, I always browse English websites and download "indescribable" stuff from "non-black-and-white" English websites, and surfing the international internet without a VPN is extremely difficult, almost every site times out, even Wikipedia is inaccessible without a VPN, I am using VPN to access this very site right now...

And here is the thing, the GFW is actively throttling VPN connections and generally VPN slows download speed down because of added encryption and network routes (nevertheless in this case, VPN actually boosts download speed without slowing it down, because without it download speed would be practically zero), the connections will often be unstable, and the download can come to a halt, the chance gets bigger the closer the download is to completion, and the download will never complete without refreshing the connections...

I hadn't encountered this problem yesterday, but today the tests have all halted, must be connected to Tokyo Olympics, however in my tests with Free Download Manager (not trying to advertise it or whatever), with the same download links and network condition, the download speed can indeed drop to zero nearing completion, but it never halts for long and the downloads have always completed, I guess because Free Download Manager refreshes download links when connection becomes poor.

So how can I implement a pause\resume\stop feature, and make the downloader automatically pause and resume (refresh) a connection if download speed is locked at 0 for a certain time period (say 5 seconds)?


Well, in case if any one really want to know what getdownlink does, it isn't included in the scope of this review, but you can view it in Google Drive, first you need this file: getcookie.py, then getdownlink.py is here, and you will need a config.py in order to make it achieve its full potential.

config.py template:

USERNAME = 'Anonymous'
PASSWORD = 'Qwerty123456'

(of course the above account isn't mine, but if you do manage to log in to the website involved in the scripts using that login, well, coincidences happen)

There will be two more constants added to config.py added to the file after first execution: COOKIES and TOKEN.

Basically, it downloads music from this website: https://music.163.com straight using its internal API without using its ****** client software, with one song id (an integer like this: 36607898, you can get it from links of song pages: https://music.163.com/#/song?id=36607898, or scrap it using xpath), and you will need an account to download higher quality (320kbps mp3) songs at all, otherwise you can only download normal quality songs (96kbps mp3), and many, many songs require VIP membership to be downloaded (it costs at least 20RMB, or 3.08USD per month), and even with VIP, many more songs still can't be downloaded due to copyright issues.

I used selenium to login because using requests I will need to pass checkToken parameter, otherwise it isn't possible to login, but frankly selenium is much slower than requests and the xpaths can change (the website changes those to impede scrapers), but again, I don't know a thing about javascript and the checkToken is very hard to crack, but perhaps someone with extensive knowledge in javascript will fare better, the script where the functions are from is located (currently) here: https://s3.music.126.net/web/s/core_596f2c1e5a5d58c7994993068862de4e.js,

should that address change, you can see it here:

core.js, if anyone does manage to crack generation of checkToken parameter, please contact me via email, my email is same as the account that shared these files.

The script (downloader) will be mainly used in my GUI program (that manages the songs I scraped from said website) to download the songs using links retrieved by getdownlink.

By the way, you will need Chinese IP address to download the songs, because the website blocks foreign address from downloading them.

import re
import requests
import sys
import time
import validators
from collections import deque
from datetime import datetime, timedelta
from math import inf
from mmap import mmap
from pathlib import Path
from ping3 import ping
from reprint import output
from threading import Thread

def timestring(sec):
    """Format a duration in seconds as a zero-padded 'HH:MM:SS' string."""
    total = int(sec)
    hours, remainder = divmod(total, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

class Downloader:
    """Multi-threaded HTTP downloader with a live terminal progress display.

    When the server reports a content length and honours `range` requests,
    the file is split into `num_connections` byte ranges downloaded
    concurrently into a memory-mapped output file; otherwise it falls back
    to a sequential single-connection download.
    """
    def __init__(self):
        # Memory map over the output file; populated only on the
        # multi-threaded code path inside download().
        self.mm = None
        # Total bytes received so far; incremented by all worker threads.
        self.count = 0
        # Rolling window of recent byte totals sampled by the progress
        # loop; used to estimate the instantaneous download speed.
        self.recent = deque([0] * 12, maxlen=12)
    
    def multidown(self, url, start, end):
        """Download bytes [start, end) of `url` into the shared mmap."""
        # The HTTP `range` header is inclusive at both ends, hence end - 1.
        r = requests.get(url, headers={'range': 'bytes={0}-{1}'.format(start, end-1)}, stream=True)
        i = start
        # Stream in 1 MiB chunks, writing each at its absolute file offset.
        for chunk in r.iter_content(1048576):
            if chunk:
                self.mm[i: i+len(chunk)] = chunk
                self.count += len(chunk)
                i += len(chunk)
    
    def singledown(self, url, path):
        """Download `url` sequentially into `path` over one connection."""
        with requests.get(url, stream=True) as r:
            with path.open('wb') as file:
                # Keep-alive chunks are empty; skip them.
                for chunk in r.iter_content(1048576):
                    if chunk:
                        self.count += len(chunk)
                        file.write(chunk)
    
    def download(self, url, filepath, num_connections=32, overwrite=False):
        """Validate the destination and url, probe the server, then download.

        Args:
            url: Direct http(s) link to the target file.
            filepath: Absolute Windows-style destination path.
            num_connections: Parallel range requests to use when supported.
            overwrite: Overwrite an existing file without prompting.
        """
        singlethread = False
        threads = []
        bcontinue = False
        # Normalise to forward slashes so the Windows-path regex applies.
        filepath = filepath.replace('\\', '/')
        # Require a well-formed absolute Windows path on an existing drive.
        if (not re.match('^[a-zA-Z]:/(((?![<>:"/|?*]).)+((?<![ .])/)?)*$', filepath) or 
            not Path(filepath[:3]).exists()):
            print('Invalid windows file path has been inputted, process will now stop.')
            return
        if not validators.url(url):
            print('Invalid url been inputted, process will now stop.')
            return
        if url.lower().startswith('ftp://'):
            print("`requests` module doesn't suport File Transfer Protocol, process will now stop")
            return
        path = Path(filepath)
        if not path.exists():
            bcontinue = True
        else:
            if path.is_file():
                if overwrite:
                    bcontinue = True
                else:
                    # Interactive confirmation loop; repeats on invalid input.
                    while True:
                        answer = input(f'`{filepath}` already exists, do you want to overwrite it? \n(Yes, No):').lower()
                        if answer in ['y', 'yes', 'n', 'no']:
                            if answer.startswith('y'):
                                bcontinue = True
                            break
                        else:
                            print('Invalid input detected, retaking input.')
        if not bcontinue:
            print(f'Overwritting {filepath} has been aborted, process will now stop.')
            return
        bcontinue = False
        # Hostname portion of the url ('scheme://host/...' -> index 2).
        server = url.split('/')[2]
        ok = ping(server, timeout=2)
        # NOTE(review): per ping3's convention, False means unknown host and
        # None means timeout, so `== False` deliberately excludes None here.
        if ok == False:
            print('The server of the inputted url is non-existent, process will now stop.')
            return
        if ok:
            bcontinue = True
        if not ok:
            # Timed out: retry the ping up to 5 times before giving up.
            print('Connection has timed out, will reattempt to ping server 5 times.')
            for i in range(5):
                print(f'Reattempting to ping server, retrying {i + 1} out of 5')
                ok = ping(server, timeout=2)
                if ok:
                    print(f'Connection successful on retry {i + 1}, process will now continue.')
                    bcontinue = True
                    break
                else:
                    print(f'Retry {i + 1} out of 5 timed out' + (i != 4) * ', reattempting in 1 second.' + (i == 4) * '.')
                    time.sleep(1)
        if not bcontinue:
            print('Failed to connect server, connection timed out, process will now stop')
            return
        bcontinue = False
        # HEAD request to obtain headers (content-length) without the body;
        # retried up to 5 times on a non-200 status.
        head = requests.head(url)
        if head.status_code == 200:
            bcontinue = True
        else:
            for i in range(5):
                print(f'Server responce is invalid, retrying {i + 1} out of 5')
                head = requests.head(url)
                if head.status_code == 200:
                    print(f'Connection successful on retry {i + 1}, process will now continue.')
                    bcontinue = True
                    break
                else:
                    print(f'Retry {i + 1} out of 5 failed to access data' + (i != 4) * ', reattempting in 1 second.' + (i == 4) * '.')
                    time.sleep(1)
        if not bcontinue:
            print("Can't establish a connection with access to data, can't download target file, process will now stop.")
            return
        # Ensure the destination directory exists.
        folder = '/'.join(filepath.split('/')[:-1])
        Path(folder).mkdir(parents=True, exist_ok=True)
        headers = head.headers
        total = headers.get('content-length')
        if not total:
            # Unknown size: single-threaded download, progress shown against inf.
            print(f'Cannot find the total length of the content of {url}, the file will be downloaded using a single thread.')
            started = datetime.now()
            print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
            th = Thread(target=self.singledown, args=(url, path))
            threads.append(th)
            th.start()
            total = inf
            singlethread = True
        else:
            total = int(total)
            # Probe range support: a compliant server answers 206 Partial Content.
            code = requests.head(url, headers={'range':'bytes=0-100'}).status_code
            if code != 206:
                print('Server does not support the `range` parameter, the file will be downloaded using a single thread.')
                started = datetime.now()
                print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
                th = Thread(target=self.singledown, args=(url, path))
                threads.append(th)
                th.start()
                singlethread = True
            else:
                # Pre-size the file, then memory-map it so worker threads can
                # write their byte ranges directly at the right offsets.
                path.touch()
                file = path.open(mode='wb')
                file.seek(total - 1)
                file.write(b'\0')
                file.close()
                file = path.open(mode='r+b')
                self.mm = mmap(file.fileno(), 0)
                segment = total / num_connections
                started = datetime.now()
                print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
                for i in range(num_connections):
                    th = Thread(target=self.multidown, args=(url, int(segment * i), int(segment * (i + 1))))
                    threads.append(th)
                    th.start()
        downloaded = 0
        totalMiB = total / 1048576
        speeds = []
        # Sampling period of the progress loop; the per-sample byte deltas
        # below are divided by this to get MiB/s.
        interval = 0.04
        with output(initial_len=4, interval=0) as dynamic_print:
            while True:
                # Number of worker threads still running.
                status = sum([i.is_alive() for i in threads])
                downloaded = self.count
                self.recent.append(downloaded)
                done = int(100 * downloaded / total)
                doneMiB = downloaded / 1048576
                gt0 = len([i for i in self.recent if i])
                if not gt0:
                    speed = 0
                else:
                    # The deque is always full (pre-filled with zeros), so the
                    # non-zero samples occupy the last gt0 slots.
                    recent = list(self.recent)[12 - gt0:]
                    if len(recent) == 1:
                        speed = recent[0] / 1048576 / interval
                    else:
                        diff = [b - a for a, b in zip(recent, recent[1:])]
                        speed = sum(diff) / len(diff) / 1048576 / interval
                speeds.append(speed)
                nzspeeds = [i for i in speeds if i]
                if nzspeeds:
                    minspeed = min(nzspeeds)
                else:
                    minspeed = 0
                maxspeed = max(speeds)
                now = datetime.now()
                elapsed = (now - started).total_seconds()
                meanspeed = downloaded / elapsed / 1048576
                remaining = totalMiB - doneMiB
                dynamic_print[0] = '[{0}{1}] {2}'.format(
                    '\u2588' * done, '\u00b7' * (100-done), str(done)) + '% completed'
                dynamic_print[1] = '{0:.2f} MiB downloaded, {1:.2f} MiB total, {2:.2f} MiB remaining, download speed: {3:.2f} MiB/s'.format(
                    doneMiB, totalMiB, remaining, speed)
                # ETA from the instantaneous speed; shown as a sentinel value
                # when stalled or when the total size is unknown.
                if speed and total != inf:
                    eta = timestring(remaining / speed)
                else:
                    eta = '99:59:59'
                dynamic_print[2] = 'Minimum speed: {0:.2f} MiB/s, average speed: {1:.2f} MiB/s, maximum speed: {2:.2f} MiB/s'.format(minspeed, meanspeed, maxspeed)
                dynamic_print[3] = 'Task started on {0}, {1} elapsed, ETA: {2}'.format(
                    started.strftime('%Y-%m-%d %H:%M:%S'), timestring(elapsed), eta)
                if status == 0:
                    ended = datetime.now()
                    if not singlethread:
                        self.mm.close()
                    break
                time.sleep(interval)
        time_spent = (ended - started).total_seconds()
        # NOTE(review): when the content length was unknown, total is inf and
        # this average is inf too; self.count would be a better numerator.
        meanspeed = total / time_spent / 1048576
        print('Task completed on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
            ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))

if __name__ == '__main__':
    # CLI entry point: python script.py URL FILEPATH [NUM_CONNECTIONS] [OVERWRITE]
    # sys.argv items are strings, but download() needs an int for
    # num_connections (used in range()) and a bool for overwrite, so the
    # optional arguments are coerced before the call.
    d = Downloader()
    args = list(sys.argv[1:])
    if len(args) >= 3:
        args[2] = int(args[2])
    if len(args) >= 4:
        args[3] = args[3].lower() in ('1', 'true', 'yes', 'y')
    d.download(*args)

Final update:


Update

This update didn't do much change to the logic of the code, the actual download methods remain unchanged, I just added a few new checks to validate the url and find whether the server exists or not, check connectivity with the server, and stop execution if the process cannot proceed. And I have re-implemented the average speed and eta to make them more logical, and used .total_seconds() method instead of .seconds in calculations.

The changes are small and the existing answer doesn't actually improve the performance of the code itself, so please don't revert this edit.

Currently this script doesn't support FTP because requests doesn't support FTP, but with small tweaks, using urllib2 to download over FTP is trivial, but I really can't figure out how can I pause and resume downloads, and send a signal to stop the download.

In my testing I have encountered this:

[████████████████████████████████████████████████████████████████████████████████████████████········] 92% completed
1374.16 MiB downloaded, 1489.83 MiB total, 115.67 MiB remaining, download speed: 0.00 MiB/s
Minimum speed: 1.27 MiB/s, average speed: 2.88 MiB/s, maximum speed: 54.55 MiB/s
Task started on 2021-08-08 17:25:05, 00:07:57 elapsed, ETA: 99:59:59

I am in China and I never ever surf Chinese net, I always browse English websites and download "indescribable" stuff from "non-black-and-white" English websites, and surfing the international internet without a VPN is extremely difficult, almost every site times out, even Wikipedia is inaccessible without a VPN, I am using VPN to access this very site right now...

And here is the thing, the GFW is actively throttling VPN connections and generally VPN slows download speed down because of added encryption and network routes (nevertheless in this case, VPN actually boosts download speed without slowing it down, because without it download speed would be practically zero), the connections will often be unstable, and the download can come to a halt, the chance gets bigger the closer the download is to completion, and the download will never complete without refreshing the connections...

I hadn't encountered this problem yesterday, but today the tests have all halted, must be connected to Tokyo Olympics, however in my tests with Free Download Manager (not trying to advertise it or whatever), with the same download links and network condition, the download speed can indeed drop to zero nearing completion, but it never halts for long and the downloads have always completed, I guess because Free Download Manager refreshes download links when connection becomes poor.

So how can I implement a pause\resume\stop feature, and make the downloader automatically pause and resume (refresh) a connection if download speed is locked at 0 for a certain time period (say 5 seconds)?


Well, in case if any one really want to know what getdownlink does, it isn't included in the scope of this review, but you can view it in Google Drive, first you need this file: getcookie.py, then getdownlink.py is here, and you will need a config.py in order to make it achieve its full potential.

config.py template:

USERNAME = 'Anonymous'
PASSWORD = 'Qwerty123456'

(of course the above account isn't my, but if you do manage to log in the website involved in the scripts using that login, well coincidences happen)

There will be two more constants added to config.py added to the file after first execution: COOKIES and TOKEN.

Basically, it downloads music from this website: https://music.163.com straight using its internal API without using its ****** client software, with one song id (an integer like this: 36607898, you can get it from links of song pages: https://music.163.com/#/song?id=36607898, or scrap it using xpath), and you will need an account to download higher quality (320kbps mp3) songs at all, otherwise you can only download normal quality songs (96kbps mp3), and many, many songs require VIP membership to be downloaded (it costs at least 20RMB, or 3.08USD per month), and even with VIP, many more songs still can't be downloaded due to copyright issues.

I used selenium to login because using requests I will need to pass checkToken parameter, otherwise it isn't possible to login, but frankly selenium is much slower than requests and the xpaths can change (the website changes those to impede scrapers), but again, I don't know a thing about javascript and the checkToken is very hard to crack, but perhaps someone with extensive knowledge in javascript will fare better, the script where the functions are from is located (currently) here: https://s3.music.126.net/web/s/core_596f2c1e5a5d58c7994993068862de4e.js,

should that address change, you can see it here:

core.js, if anyone does manage to crack generation of checkToken parameter, please contact me via email, my email is same as the account that shared these files.

The script (downloader) will be mainly used in my GUI program (that manages the songs I scraped from said website) to download the songs using links retrieved by getdownlink.

By the way, you will need Chinese IP address to download the songs, because the website blocks foreign address from downloading them.

import re
import requests
import sys
import time
from collections import deque
from datetime import datetime, timedelta
from math import inf
from mmap import mmap
from pathlib import Path
from reprint import output
from threading import Thread

def timestring(sec):
    """Convert a duration in seconds to a zero-padded 'HH:MM:SS' string.

    Args:
        sec: Duration in seconds (int or float); truncated toward zero.

    Returns:
        The duration formatted as 'HH:MM:SS'.
    """
    sec = int(sec)
    # divmod replaces the manual //, % and zfill chain in one step:
    # total seconds -> (minutes, seconds) -> (hours, minutes).
    m, s = divmod(sec, 60)
    h, m = divmod(m, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

class downloader:
    """Download a URL to a local file.

    Uses ``num_connections`` range-request threads writing into a shared
    mmap when the server reports a content length and honours ``range``
    headers; otherwise falls back to a single streaming thread. Live
    progress (percent bar, speeds, ETA) is rendered via ``reprint``.

    NOTE(review): the class name violates PEP 8 (should be ``Downloader``),
    and running the entire download from ``__init__`` makes the class
    awkward to reuse — a plain function or explicit ``start()`` would be
    clearer.
    """

    def __init__(self, url, filepath, num_connections=32, overwrite=False):
        # mmap over the destination file; populated only on the
        # multi-threaded path, stays None for single-threaded downloads.
        self.mm = None
        # Total bytes received so far; incremented by the worker threads
        # and read by the progress loop.
        self.count = 0
        # Rolling window of the last 20 cumulative byte counts, sampled on
        # each progress tick; consecutive differences estimate recent speed.
        self.recent = deque([0] * 20, maxlen=20)
        # The whole download runs synchronously inside the constructor.
        self.download(url, filepath, num_connections, overwrite)

    def multidown(self, url, start, end):
        """Worker thread: fetch bytes [start, end) of *url* into the shared mmap.

        Writes land directly at the segment's byte offset in ``self.mm``,
        so no reassembly step is needed afterwards.
        """
        # HTTP byte ranges are inclusive at both ends, hence end-1.
        r = requests.get(url, headers={'range': 'bytes={0}-{1}'.format(start, end-1)}, stream=True)
        i = start
        # Stream in 1 MiB chunks to bound memory use.
        for chunk in r.iter_content(1048576):
            if chunk:
                self.mm[i: i+len(chunk)] = chunk
                # NOTE(review): `+=` on a shared attribute from multiple
                # threads is not atomic in CPython; count only feeds the
                # progress display, but it can occasionally under-report.
                self.count += len(chunk)
                i += len(chunk)

    def singledown(self, url, path):
        """Worker thread: stream the whole *url* into *path* over one connection.

        Fallback used when the server gives no content length or rejects
        range requests.
        """
        with requests.get(url, stream=True) as r:
            with path.open('wb') as file:
                # Stream in 1 MiB chunks to bound memory use.
                for chunk in r.iter_content(1048576):
                        if chunk:
                            self.count += len(chunk)
                            file.write(chunk)

    def download(self, url, filepath, num_connections=32, overwrite=False):
        """Validate the destination path, probe the server, launch worker
        thread(s), and display live progress until the download completes.

        Parameters
        ----------
        url : str
            URL to download.
        filepath : str
            Destination path; validated against a Windows-drive-path regex,
            so this method is Windows-specific as written.
        num_connections : int
            Number of range-request threads on the multi-threaded path.
        overwrite : bool
            If False and the file exists, the user is prompted on stdin.
        """
        singlethread = False
        threads = []
        bcontinue = False
        # Normalise to forward slashes so the regex and split logic below
        # only have to deal with one separator.
        filepath = filepath.replace('\\', '/')
        # Require a drive-letter Windows path with no illegal characters,
        # and a drive root (e.g. 'C:/') that actually exists.
        if (not re.match('^[a-zA-Z]:/(((?![<>:"/|?*]).)+((?<![ .])/)?)*$', filepath) or 
            not Path(filepath[:3]).exists()):
            print('Invalid windows file path has been inputted, process will now stop.')
            return
        path = Path(filepath)
        if not path.exists():
            bcontinue = True
        else:
            # Only prompt for plain files; an existing directory at this
            # path falls through with bcontinue False and aborts below.
            if path.is_file():
                if overwrite:
                    bcontinue = True
                else:
                    # Re-ask until a recognisable yes/no answer is given.
                    while True:
                        answer = input(f'`{filepath}` already exists, do you want to overwrite it? \n(Yes, No):').lower()
                        if answer in ['y', 'yes', 'n', 'no']:
                            if answer.startswith('y'):
                                bcontinue = True
                            break
                        else:
                            print('Invalid input detected, retaking input.')
        if not bcontinue:
            # NOTE(review): 'Overwritting' is a typo ('Overwriting') in this
            # user-facing message.
            print(f'Overwritting {filepath} has been aborted, process will now stop.')
            return
        # Reuse bcontinue to track connection success for the probe below.
        bcontinue = False
        head = requests.head(url)
        if head.status_code == 200:
            bcontinue = True
        else:
            # Up to 5 retries, one second apart.
            for i in range(5):
                print(f'Failed to connect server, retrying {i + 1} out of 5')
                head = requests.head(url)
                if head.status_code == 200:
                    print(f'Connection successful on retry {i + 1}, process will now continue.')
                    bcontinue = True
                    break
                else:
                    print(f'Retry {i + 1} out of 5 failed to connect, reattempting in 1 second.')
                    time.sleep(1)
        if not bcontinue:
            print("Connection can't be established, can't download target file, process will now stop.")
            return
        # Ensure the destination directory exists.
        folder = '/'.join(filepath.split('/')[:-1])
        Path(folder).mkdir(parents=True, exist_ok=True)
        headers = head.headers
        total = headers.get('content-length')
        if not total:
            # No length advertised: cannot split into ranges, so stream on
            # a single thread and show progress without a known total.
            print(f'Cannot find the total length of the content of {url}, the file will be downloaded using a single thread.')
            started = datetime.now()
            print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
            th = Thread(target=self.singledown, args=(url, path))
            threads.append(th)
            th.start()
            total = inf
            singlethread = True
        else:
            total = int(total)
            # Probe range support: a 206 Partial Content reply means the
            # server honours byte-range requests.
            code = requests.head(url, headers={'range':'bytes=0-100'}).status_code
            if code != 206:
                print('Server does not support the `range` parameter, the file will be downloaded using a single thread.')
                started = datetime.now()
                print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
                th = Thread(target=self.singledown, args=(url, path))
                threads.append(th)
                th.start()
                singlethread = True
            else:
                # Pre-size the file to its final length, then mmap it so
                # every worker can write its own segment in place.
                path.touch()
                file = path.open(mode='wb')
                file.seek(total - 1)
                file.write(b'\0')
                file.close()
                file = path.open(mode='r+b')
                self.mm = mmap(file.fileno(), 0)
                segment = total / num_connections
                started = datetime.now()
                print('Task started on %s.' % started.strftime('%Y-%m-%d %H:%M:%S'))
                # int() truncation of segment boundaries keeps the ranges
                # contiguous and non-overlapping across workers.
                for i in range(num_connections):
                    th = Thread(target=self.multidown, args=(url, int(segment * i), int(segment * (i + 1))))
                    threads.append(th)
                    th.start()
        downloaded = 0
        totalMiB = total / 1048576
        speeds = []
        # Progress refresh period in seconds; also the sampling period the
        # speed estimate below assumes.
        interval = 0.025
        # 4-line live display: bar, byte counts, speed stats, timing/ETA.
        with output(initial_len=4, interval=0) as dynamic_print:
            while True:
                # Number of workers still running; 0 means the download is done.
                status = sum([i.is_alive() for i in threads])
                downloaded = self.count
                self.recent.append(downloaded)
                done = int(100 * downloaded / total)
                doneMiB = downloaded / 1048576
                # Count non-zero samples so the warm-up zeros seeded into
                # the deque do not drag the speed estimate down.
                gt0 = len([i for i in self.recent if i])
                if not gt0:
                    speed = 0
                else:
                    recent = list(self.recent)[20 - gt0:]
                    if len(recent) == 1:
                        speed = recent[0] / 1048576 / interval
                    else:
                        # Mean of per-tick deltas over the window, scaled
                        # from bytes-per-interval to MiB/s.
                        diff = [b - a for a, b in zip(recent, recent[1:])]
                        speed = sum(diff) / len(diff) / 1048576 / interval
                speeds.append(speed)
                nzspeeds = [i for i in speeds if i]
                if nzspeeds:
                    minspeed = min(nzspeeds)
                else:
                    minspeed = 0
                maxspeed = max(speeds)
                meanspeed = sum(speeds) / len(speeds)
                remaining = totalMiB - doneMiB
                dynamic_print[0] = '[{0}{1}] {2}'.format(
                    '\u2588' * done, '\u00b7' * (100-done), str(done)) + '% completed'
                dynamic_print[1] = '{0:.2f} MiB downloaded, {1:.2f} MiB total, {2:.2f} MiB remaining, download speed: {3:.2f} MiB/s'.format(
                    doneMiB, totalMiB, remaining, speed)
                now = datetime.now()
                elapsed = timestring((now - started).seconds)
                if meanspeed and total != inf:
                    eta = timestring(remaining / meanspeed)
                else:
                    # Unknown total or zero mean speed: show a sentinel ETA.
                    eta = '99:59:59'
                dynamic_print[2] = 'Minimum speed: {0:.2f} MiB/s, average speed: {1:.2f} MiB/s, maximum speed: {2:.2f} MiB/s'.format(minspeed, meanspeed, maxspeed)
                dynamic_print[3] = 'Task started on {0}, {1} elapsed, ETA: {2}'.format(
                    started.strftime('%Y-%m-%d %H:%M:%S'), elapsed, eta)
                if status == 0:
                    ended = datetime.now()
                    if not singlethread:
                        self.mm.close()
                    break
                time.sleep(interval)
        time_spent = (ended - started).seconds
        meanspeed = sum(speeds) / len(speeds)
        print('Task completed on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
            ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))

if __name__ == '__main__':
    # CLI entry point: <url> <filepath> [num_connections] [overwrite].
    # NOTE(review): sys.argv items are strings, so the optional arguments
    # arrive as str rather than int/bool — TODO confirm intended.
    cli_args = sys.argv[1:]
    d = downloader(*cli_args)

Final update:

added 2376 characters in body
Source Link

Basically, it downloads music from this website: https://music.163.com straight using its internal API without using its ****** client software, with one song id (an integer like this: 36607898, you can get it from links of song pages: https://music.163.com/#/song?id=36607898, or scrap it using xpath), and you will need an account to download higher quality (320kbps mp3) songs at all, otherwise you can only download normal quality songs (96kbps mp3), and many, many songs require VIP membership to be downloaded (it costs at least 20RMB, or 3.08USD per month), and even with VIP, many more songs still can't be downloaded due to copyright issues.

The script (downloader) will be mainly used in my GUI program (that manages the songs I scraped from said website) to download the songs using links retrieved by getdownlink.

By the way, you will need Chinese IP address to download the songs, because the website blocks foreign address from downloading them.

Basically, it downloads music from this website: https://music.163.com straight using its internal API without using its ****** client software, with one song id (an integer like this: 36607898, you can get it from links of song pages: https://music.163.com/#/song?id=36607898, or scrap it using xpath), and you will need an account to download higher quality (320kbps mp3) songs at all, or you can only download normal quality songs (96kbps mp3), and many, many songs require VIP membership to be downloaded (it costs at least 20RMB, or 3.08USD per month), and even with VIP, many more songs still can't be downloaded due to copyright issues.

Basically, it downloads music from this website: https://music.163.com straight using its internal API without using its ****** client software, with one song id (an integer like this: 36607898, you can get it from links of song pages: https://music.163.com/#/song?id=36607898, or scrap it using xpath), and you will need an account to download higher quality (320kbps mp3) songs at all, otherwise you can only download normal quality songs (96kbps mp3), and many, many songs require VIP membership to be downloaded (it costs at least 20RMB, or 3.08USD per month), and even with VIP, many more songs still can't be downloaded due to copyright issues.

The script (downloader) will be mainly used in my GUI program (that manages the songs I scraped from said website) to download the songs using links retrieved by getdownlink.

By the way, you will need Chinese IP address to download the songs, because the website blocks foreign address from downloading them.

added 2376 characters in body
Source Link

Well, in case if any one really want to know what getdownlink does, it isn't included in the scope of this review, but you can view it in Google Drive, first you need this file: getcookie.py, then getdownlink.py is here, and you will need a config.py in order to make it achieve its full potential.

config.py template:

USERNAME = 'Anonymous'
PASSWORD = 'Qwerty123456'

(of course the above account isn't mine, but if you do manage to log in to the website involved in the scripts using that login, well, coincidences happen)

There will be two more constants added to config.py added to the file after first execution: COOKIES and TOKEN.

Basically, it downloads music from this website: https://music.163.com straight using its internal API without using its ****** client software, with one song id (an integer like this: 36607898, you can get it from links of song pages: https://music.163.com/#/song?id=36607898, or scrap it using xpath), and you will need an account to download higher quality (320kbps mp3) songs at all, or you can only download normal quality songs (96kbps mp3), and many, many songs require VIP membership to be downloaded (it costs at least 20RMB, or 3.08USD per month), and even with VIP, many more songs still can't be downloaded due to copyright issues.

I used selenium to login because using requests I will need to pass checkToken parameter, otherwise it isn't possible to login, but frankly selenium is much slower than requests and the xpaths can change (the website changes those to impede scrapers), but again, I don't know a thing about javascript and the checkToken is very hard to crack, but perhaps someone with extensive knowledge in javascript will fare better, the script where the functions are from is located (currently) here: https://s3.music.126.net/web/s/core_596f2c1e5a5d58c7994993068862de4e.js,

should that address change, you can see it here:

core.js, if anyone does manage to crack generation of checkToken parameter, please contact me via email, my email is same as the account that shared these files.


Well, in case if any one really want to know what getdownlink does, it isn't included in the scope of this review, but you can view it in Google Drive, first you need this file: getcookie.py, then getdownlink.py is here, and you will need a config.py in order to make it achieve its full potential.

config.py template:

USERNAME = 'Anonymous'
PASSWORD = 'Qwerty123456'

(of course the above account isn't my, but if you do manage to log in the website involved in the scripts using that login, well coincidences happen)

There will be two more constants added to config.py added to the file after first execution: COOKIES and TOKEN.

Basically, it downloads music from this website: https://music.163.com straight using its internal API without using its ****** client software, with one song id (an integer like this: 36607898, you can get it from links of song pages: https://music.163.com/#/song?id=36607898, or scrap it using xpath), and you will need an account to download higher quality (320kbps mp3) songs at all, or you can only download normal quality songs (96kbps mp3), and many, many songs require VIP membership to be downloaded (it costs at least 20RMB, or 3.08USD per month), and even with VIP, many more songs still can't be downloaded due to copyright issues.

I used selenium to login because using requests I will need to pass checkToken parameter, otherwise it isn't possible to login, but frankly selenium is much slower than requests and the xpaths can change (the website changes those to impede scrapers), but again, I don't know a thing about javascript and the checkToken is very hard to crack, but perhaps someone with extensive knowledge in javascript will fare better, the script where the functions are from is located (currently) here: https://s3.music.126.net/web/s/core_596f2c1e5a5d58c7994993068862de4e.js,

should that address change, you can see it here:

core.js, if anyone does manage to crack generation of checkToken parameter, please contact me via email, my email is same as the account that shared these files.

added 4180 characters in body
Source Link
Loading
Tweeted twitter.com/StackCodeReview/status/1424022530638155778
added 89 characters in body
Source Link
Loading
added 588 characters in body
Source Link
Loading
added 351 characters in body
Source Link
Loading
added 757 characters in body
Source Link
Loading
added 160 characters in body
Source Link
Loading
added 1656 characters in body
Source Link
Loading
Source Link
Loading