Source code for secml.utils.download_utils

"""
.. module:: DownloadUtils
   :synopsis: Collection of mixed utilities for downloading files

.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from __future__ import division
import sys
import requests
import hashlib
from io import open  # TODO: REMOVE AFTER TRANSITION TO PYTHON 3

from secml.utils import fm


def dl_file(url, output_dir, user=None, chunk_size=1024, md5_digest=None):
    """Download file from input url and store in output_dir.

    Parameters
    ----------
    url : str
        Url of the file to download.
    output_dir : str
        Path to the directory where the file should be stored.
        If folder does not exists, will be created.
    user : str or None, optional
        String with the user[:password] if required for accessing url.
        Only the first ':' separates user from password, so the
        password itself may contain ':' characters.
    chunk_size : int, optional
        Size of the data chunk to read from url in bytes. Default 1024.
    md5_digest : str or None, optional
        Expected MD5 digest of the downloaded file (hexadecimal,
        compared case-insensitively). If a different digest is computed,
        the downloaded file will be removed and ValueError is raised.

    Returns
    -------
    str
        Full path of the stored file.

    Raises
    ------
    ValueError
        If `chunk_size` is lower than 1 or if the MD5 digest of the
        downloaded file differs from `md5_digest`.
    RuntimeError
        If the response status code is not 200.

    """
    # Reject an invalid chunk size before any network/filesystem access
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1 byte")

    # Parsing user string. If no password is specified, use an empty string
    auth = None
    if user is not None:
        username, _, password = user.partition(':')
        auth = (username, password)

    r = requests.get(url, auth=auth, stream=True)
    try:
        if r.status_code != 200:
            raise RuntimeError(
                "File is not available (error code {:})".format(r.status_code))

        # Get file size (bytes). The header may be missing, in which case
        # the total size is unknown and no progress bar can be drawn.
        total_size = r.headers.get('content-length')
        if total_size is not None:
            total_size = int(total_size.strip())

        if total_size is None:
            sys.stdout.write(
                "Downloading from `{:}` (unknown size)\n".format(url))
        else:
            sys.stdout.write(
                "Downloading from `{:}` ({:} bytes)\n".format(url, total_size))
        sys.stdout.flush()

        # Create output directory if not exists
        if not fm.folder_exist(output_dir):
            fm.make_folder(output_dir)

        # Build full path of output file
        out_path = fm.join(output_dir, url.split('/')[-1])

        # Real-time updates are only provided if stdout is a tty
        is_tty = sys.stdout.isatty() is True

        # Read data and store each chunk
        dl = 0
        with open(out_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:  # filter out keep-alive new chunks
                    continue
                f.write(chunk)
                dl += len(chunk)
                if not is_tty:
                    continue
                # Report progress
                if total_size:
                    # Clamp the bar: the number of received bytes can
                    # exceed the size declared by the server.
                    done = min(50, (50 * dl) // total_size)
                    sys.stdout.write("\r[{:}{:}] {:}/{:}".format(
                        '=' * done, ' ' * (50 - done), dl, total_size))
                else:
                    # Unknown (or zero) declared size: show byte count only
                    sys.stdout.write("\r{:} bytes".format(dl))
                sys.stdout.flush()
    finally:
        # Always release the connection held by the streamed response
        r.close()

    sys.stdout.write("\nFile stored in `{:}`\n".format(out_path))
    sys.stdout.flush()

    if md5_digest is not None and \
            md5_digest.lower() != md5(out_path, chunk_size):
        fm.remove_file(out_path)  # Remove the probably-corrupted file
        raise ValueError("Unexpected MD5 hash for the downloaded file.")

    return out_path


def md5(fname, blocksize=65536):
    """Generate RSA's MD5 digest for input file.

    Parameters
    ----------
    fname : str
        Path to the file to parse
    blocksize : int
        Size in bytes of the file chunks to read. Default 65536.
        Must be at least 1.

    Returns
    -------
    str
        MD5 hex digest of input file.

    Raises
    ------
    ValueError
        If `blocksize` is lower than 1.

    """
    # A zero-sized read returns b"" right away, which would end the loop
    # below immediately and yield the digest of an empty input.
    if blocksize < 1:
        raise ValueError("blocksize must be at least 1 byte")

    hash_md5 = hashlib.md5()
    with open(fname, mode='rb') as f:
        # Read fixed-size chunks until an empty read signals end of file
        for chunk in iter(lambda: f.read(blocksize), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()