From e17af5926156c1f9f9d793b29bc360bece97a5a0 Mon Sep 17 00:00:00 2001 From: Nikita Skakun Date: Thu, 30 Mar 2023 00:21:34 -0700 Subject: [PATCH 1/8] Add support for resuming downloads This commit adds the ability to resume interrupted downloads by adding a new function to the downloader module. The function uses the HTTP Range header to fetch only the remaining part of a file that wasn't downloaded yet. --- download-model.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/download-model.py b/download-model.py index 7e5f61b2..94524f76 100644 --- a/download-model.py +++ b/download-model.py @@ -27,8 +27,23 @@ parser.add_argument('--output', type=str, default=None, help='The folder where t args = parser.parse_args() def get_file(url, output_folder): - r = requests.get(url, stream=True) - with open(output_folder / Path(url.rsplit('/', 1)[1]), 'wb') as f: + filename = Path(url.rsplit('/', 1)[1]) + output_path = output_folder / filename + if output_path.exists(): + # Check if the file has already been downloaded completely + r = requests.head(url) + total_size = int(r.headers.get('content-length', 0)) + if output_path.stat().st_size == total_size: + return + # Otherwise, resume the download from where it left off + headers = {'Range': f'bytes={output_path.stat().st_size}-'} + mode = 'ab' + else: + headers = {} + mode = 'wb' + + r = requests.get(url, stream=True, headers=headers) + with open(output_path, mode) as f: total_size = int(r.headers.get('content-length', 0)) block_size = 1024 with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: @@ -149,7 +164,7 @@ def get_download_links_from_huggingface(model, branch): return links, sha256, is_lora def download_files(file_list, output_folder, num_threads=8): - thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads) + thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads, disable=True) if __name__ == '__main__': model = args.MODEL From 8c590c2362d4ee783b41a93f6c03f8e19dc40657 Mon Sep 17 00:00:00 2001 From: Nikita Skakun Date: Thu, 30 Mar 2023 00:42:19 -0700 Subject: [PATCH 2/8] Added a 'clean' flag to not resume download. --- download-model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/download-model.py b/download-model.py index 94524f76..ed8a047d 100644 --- a/download-model.py +++ b/download-model.py @@ -24,12 +24,13 @@ parser.add_argument('--branch', type=str, default='main', help='Name of the Git parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.') parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).') parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.') +parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.') args = parser.parse_args() def get_file(url, output_folder): filename = Path(url.rsplit('/', 1)[1]) output_path = output_folder / filename - if output_path.exists(): + if output_path.exists() and not args.clean: # Check if the file has already been downloaded completely r = requests.head(url) total_size = int(r.headers.get('content-length', 0)) From 297ac051d91c52ee6e0a39bc745d84f85a305346 Mon Sep 17 00:00:00 2001 From: Nikita Skakun Date: Thu, 30 Mar 2023 02:34:19 -0700 Subject: [PATCH 3/8] Added sha256 validation of model files. --- download-model.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/download-model.py b/download-model.py index ed8a047d..0b3f16d7 100644 --- a/download-model.py +++ b/download-model.py @@ -17,6 +17,7 @@ from pathlib import Path import requests import tqdm from tqdm.contrib.concurrent import thread_map +import hashlib parser = argparse.ArgumentParser() parser.add_argument('MODEL', type=str, default=None, nargs='?') @@ -213,4 +214,17 @@ if __name__ == '__main__': # Downloading the files print(f"Downloading the model to {output_folder}") download_files(links, output_folder, args.threads) - print() + + print('\n') + # Validate the checksums + validated = True + for i in range(len(sha256)): + with open(output_folder / sha256[i][0], "rb") as f: + bytes = f.read() + file_hash = hashlib.sha256(bytes).hexdigest() + if file_hash != sha256[i][1]: + print(f'[!] Checksum for {sha256[i][0]} failed!') + validated = False + + if validated: + print('[+] Validated checksums of all model files!') \ No newline at end of file From d550c12a3eda0aeceacc8013f9f77808d9f524be Mon Sep 17 00:00:00 2001 From: Nikita Skakun Date: Thu, 30 Mar 2023 12:52:16 -0700 Subject: [PATCH 4/8] Fixed the bug with additional bytes. The issue seems to be with huggingface not reporting the entire size of the model. Added an error message with instructions if the checksums don't match. --- download-model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/download-model.py b/download-model.py index 0b3f16d7..52cdae81 100644 --- a/download-model.py +++ b/download-model.py @@ -35,7 +35,7 @@ def get_file(url, output_folder): # Check if the file has already been downloaded completely r = requests.head(url) total_size = int(r.headers.get('content-length', 0)) - if output_path.stat().st_size == total_size: + if output_path.stat().st_size >= total_size: return # Otherwise, resume the download from where it left off headers = {'Range': f'bytes={output_path.stat().st_size}-'} @@ -215,7 +215,6 @@ if __name__ == '__main__': print(f"Downloading the model to {output_folder}") download_files(links, output_folder, args.threads) - print('\n') # Validate the checksums validated = True for i in range(len(sha256)): @@ -227,4 +226,6 @@ if __name__ == '__main__': validated = False if validated: - print('[+] Validated checksums of all model files!') \ No newline at end of file + print('[+] Validated checksums of all model files!') + else: + print('[-] Rerun the download-model.py with --clean flag') \ No newline at end of file From 0cc89e7755512242784fc4ed5a8c0c64d6bf689e Mon Sep 17 00:00:00 2001 From: Nikita Skakun Date: Thu, 30 Mar 2023 20:06:12 -0700 Subject: [PATCH 5/8] Checksum code now activated by --check flag. --- download-model.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/download-model.py b/download-model.py index 52cdae81..b2788a60 100644 --- a/download-model.py +++ b/download-model.py @@ -26,6 +26,7 @@ parser.add_argument('--threads', type=int, default=1, help='Number of files to d parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).') parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.') parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.') +parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.') args = parser.parse_args() def get_file(url, output_folder): @@ -215,17 +216,18 @@ if __name__ == '__main__': print(f"Downloading the model to {output_folder}") download_files(links, output_folder, args.threads) - # Validate the checksums - validated = True - for i in range(len(sha256)): - with open(output_folder / sha256[i][0], "rb") as f: - bytes = f.read() - file_hash = hashlib.sha256(bytes).hexdigest() - if file_hash != sha256[i][1]: - print(f'[!] Checksum for {sha256[i][0]} failed!') - validated = False - - if validated: - print('[+] Validated checksums of all model files!') - else: - print('[-] Rerun the download-model.py with --clean flag') \ No newline at end of file + if args.check: + # Validate the checksums + validated = True + for i in range(len(sha256)): + with open(output_folder / sha256[i][0], "rb") as f: + bytes = f.read() + file_hash = hashlib.sha256(bytes).hexdigest() + if file_hash != sha256[i][1]: + print(f'[!] Checksum for {sha256[i][0]} failed!') + validated = False + + if validated: + print('[+] Validated checksums of all model files!') + else: + print('[-] Rerun the download-model.py with --clean flag') \ No newline at end of file From 92c7068daff5655190e448705cdf498904873e93 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 01:31:47 -0300 Subject: [PATCH 6/8] Don't download if --check is specified --- download-model.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/download-model.py b/download-model.py index b2788a60..8e3f2e8b 100644 --- a/download-model.py +++ b/download-model.py @@ -9,6 +9,7 @@ python download-model.py facebook/opt-1.3b import argparse import base64 import datetime +import hashlib import json import re import sys @@ -17,7 +18,6 @@ from pathlib import Path import requests import tqdm from tqdm.contrib.concurrent import thread_map -import hashlib parser = argparse.ArgumentParser() parser.add_argument('MODEL', type=str, default=None, nargs='?') @@ -212,22 +212,32 @@ if __name__ == '__main__': if sha256_str != '': f.write(f'sha256sum:\n{sha256_str}') - # Downloading the files - print(f"Downloading the model to {output_folder}") - download_files(links, output_folder, args.threads) - if args.check: # Validate the checksums validated = True for i in range(len(sha256)): + fpath = (output_folder / sha256[i][0]) + + if not fpath.exists(): + print(f"The following file is missing: {fpath}") + validated = False + continue + with open(output_folder / sha256[i][0], "rb") as f: bytes = f.read() file_hash = hashlib.sha256(bytes).hexdigest() if file_hash != sha256[i][1]: - print(f'[!] Checksum for {sha256[i][0]} failed!') + print(f'Checksum failed: {sha256[i][0]} {sha256[i][1]}') validated = False + else: + print(f'Checksum validated: {sha256[i][0]} {sha256[i][1]}') if validated: print('[+] Validated checksums of all model files!') else: - print('[-] Rerun the download-model.py with --clean flag') \ No newline at end of file + print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.') + + else: + # Downloading the files + print(f"Downloading the model to {output_folder}") + download_files(links, output_folder, args.threads) \ No newline at end of file From b99bea3c69d997d7d8988b2fcd5def1173e306ea Mon Sep 17 00:00:00 2001 From: Nikita Skakun Date: Thu, 30 Mar 2023 23:11:59 -0700 Subject: [PATCH 7/8] Fixed reported header affecting resuming download --- download-model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download-model.py b/download-model.py index 8e3f2e8b..75b031bd 100644 --- a/download-model.py +++ b/download-model.py @@ -34,7 +34,7 @@ def get_file(url, output_folder): output_path = output_folder / filename if output_path.exists() and not args.clean: # Check if the file has already been downloaded completely - r = requests.head(url) + r = requests.get(url, stream=True) total_size = int(r.headers.get('content-length', 0)) if output_path.stat().st_size >= total_size: return From 74462ac713201fd6e8036fcc947c57b9c9924ebf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 22:52:52 -0300 Subject: [PATCH 8/8] Don't override the metadata when checking the sha256sum --- download-model.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/download-model.py b/download-model.py index 75b031bd..43473215 100644 --- a/download-model.py +++ b/download-model.py @@ -197,20 +197,7 @@ if __name__ == '__main__': output_folder = f"{'_'.join(model.split('/')[-2:])}" if branch != 'main': output_folder += f'_{branch}' - - # Creating the folder and writing the metadata output_folder = Path(base_folder) / output_folder - if not output_folder.exists(): - output_folder.mkdir() - with open(output_folder / 'huggingface-metadata.txt', 'w') as f: - f.write(f'url: https://huggingface.co/{model}\n') - f.write(f'branch: {branch}\n') - f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') - sha256_str = '' - for i in range(len(sha256)): - sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n' - if sha256_str != '': - f.write(f'sha256sum:\n{sha256_str}') if args.check: # Validate the checksums @@ -238,6 +225,20 @@ if __name__ == '__main__': print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.') else: + + # Creating the folder and writing the metadata + if not output_folder.exists(): + output_folder.mkdir() + with open(output_folder / 'huggingface-metadata.txt', 'w') as f: + f.write(f'url: https://huggingface.co/{model}\n') + f.write(f'branch: {branch}\n') + f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') + sha256_str = '' + for i in range(len(sha256)): + sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n' + if sha256_str != '': + f.write(f'sha256sum:\n{sha256_str}') + # Downloading the files print(f"Downloading the model to {output_folder}") download_files(links, output_folder, args.threads) \ No newline at end of file