From e17af5926156c1f9f9d793b29bc360bece97a5a0 Mon Sep 17 00:00:00 2001
From: Nikita Skakun <nikita@skakun-family.com>
Date: Thu, 30 Mar 2023 00:21:34 -0700
Subject: [PATCH 1/8] Add support for resuming downloads

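This commit adds the ability to resume interrupted downloads by extending the existing get_file() function in download-model.py. When a partial file already exists, the function sends an HTTP Range header to fetch only the bytes that have not been downloaded yet and appends them to the file; a file whose size already matches the reported Content-Length is skipped.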
---
 download-model.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/download-model.py b/download-model.py
index 7e5f61b2..94524f76 100644
--- a/download-model.py
+++ b/download-model.py
@@ -27,8 +27,23 @@ parser.add_argument('--output', type=str, default=None, help='The folder where t
 args = parser.parse_args()
 
 def get_file(url, output_folder):
-    r = requests.get(url, stream=True)
-    with open(output_folder / Path(url.rsplit('/', 1)[1]), 'wb') as f:
+    filename = Path(url.rsplit('/', 1)[1])
+    output_path = output_folder / filename
+    if output_path.exists():
+        # Check if the file has already been downloaded completely
+        r = requests.head(url)
+        total_size = int(r.headers.get('content-length', 0))
+        if output_path.stat().st_size == total_size:
+            return
+        # Otherwise, resume the download from where it left off
+        headers = {'Range': f'bytes={output_path.stat().st_size}-'}
+        mode = 'ab'
+    else:
+        headers = {}
+        mode = 'wb'
+
+    r = requests.get(url, stream=True, headers=headers)
+    with open(output_path, mode) as f:
         total_size = int(r.headers.get('content-length', 0))
         block_size = 1024
         with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
@@ -149,7 +164,7 @@ def get_download_links_from_huggingface(model, branch):
     return links, sha256, is_lora
 
 def download_files(file_list, output_folder, num_threads=8):
-    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads)
+    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads, disable=True)
 
 if __name__ == '__main__':
     model = args.MODEL

From 8c590c2362d4ee783b41a93f6c03f8e19dc40657 Mon Sep 17 00:00:00 2001
From: Nikita Skakun <nikita@skakun-family.com>
Date: Thu, 30 Mar 2023 00:42:19 -0700
Subject: [PATCH 2/8] Added a 'clean' flag to not resume download.

---
 download-model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index 94524f76..ed8a047d 100644
--- a/download-model.py
+++ b/download-model.py
@@ -24,12 +24,13 @@ parser.add_argument('--branch', type=str, default='main', help='Name of the Git
 parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
 parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
 parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
 args = parser.parse_args()
 
 def get_file(url, output_folder):
     filename = Path(url.rsplit('/', 1)[1])
     output_path = output_folder / filename
-    if output_path.exists():
+    if output_path.exists() and not args.clean:
         # Check if the file has already been downloaded completely
         r = requests.head(url)
         total_size = int(r.headers.get('content-length', 0))

From 297ac051d91c52ee6e0a39bc745d84f85a305346 Mon Sep 17 00:00:00 2001
From: Nikita Skakun <nikita@skakun-family.com>
Date: Thu, 30 Mar 2023 02:34:19 -0700
Subject: [PATCH 3/8] Added sha256 validation of model files.

---
 download-model.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index ed8a047d..0b3f16d7 100644
--- a/download-model.py
+++ b/download-model.py
@@ -17,6 +17,7 @@ from pathlib import Path
 import requests
 import tqdm
 from tqdm.contrib.concurrent import thread_map
+import hashlib
 
 parser = argparse.ArgumentParser()
 parser.add_argument('MODEL', type=str, default=None, nargs='?')
@@ -213,4 +214,17 @@ if __name__ == '__main__':
     # Downloading the files
     print(f"Downloading the model to {output_folder}")
     download_files(links, output_folder, args.threads)
-    print()
+    
+    print('\n')
+    # Validate the checksums
+    validated = True
+    for i in range(len(sha256)):
+        with open(output_folder / sha256[i][0], "rb") as f:
+            bytes = f.read()
+            file_hash = hashlib.sha256(bytes).hexdigest()
+            if file_hash != sha256[i][1]:
+                print(f'[!] Checksum for {sha256[i][0]} failed!')
+                validated = False
+    
+    if validated:
+        print('[+] Validated checksums of all model files!')
\ No newline at end of file

From d550c12a3eda0aeceacc8013f9f77808d9f524be Mon Sep 17 00:00:00 2001
From: Nikita Skakun <nikita@skakun-family.com>
Date: Thu, 30 Mar 2023 12:52:16 -0700
Subject: [PATCH 4/8] Fixed the bug with additional bytes.

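The issue seems to be that Hugging Face does not always report the full size of the model file, so the completeness check in get_file() now uses `>=` instead of `==` when comparing the local file size with the reported Content-Length.
Also added a message telling the user to rerun with --clean if the checksums don't match.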
---
 download-model.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/download-model.py b/download-model.py
index 0b3f16d7..52cdae81 100644
--- a/download-model.py
+++ b/download-model.py
@@ -35,7 +35,7 @@ def get_file(url, output_folder):
         # Check if the file has already been downloaded completely
         r = requests.head(url)
         total_size = int(r.headers.get('content-length', 0))
-        if output_path.stat().st_size == total_size:
+        if output_path.stat().st_size >= total_size:
             return
         # Otherwise, resume the download from where it left off
         headers = {'Range': f'bytes={output_path.stat().st_size}-'}
@@ -215,7 +215,6 @@ if __name__ == '__main__':
     print(f"Downloading the model to {output_folder}")
     download_files(links, output_folder, args.threads)
     
-    print('\n')
     # Validate the checksums
     validated = True
     for i in range(len(sha256)):
@@ -227,4 +226,6 @@ if __name__ == '__main__':
                 validated = False
     
     if validated:
-        print('[+] Validated checksums of all model files!')
\ No newline at end of file
+        print('[+] Validated checksums of all model files!')
+    else:
+        print('[-] Rerun the download-model.py with --clean flag')
\ No newline at end of file

From 0cc89e7755512242784fc4ed5a8c0c64d6bf689e Mon Sep 17 00:00:00 2001
From: Nikita Skakun <nikita@skakun-family.com>
Date: Thu, 30 Mar 2023 20:06:12 -0700
Subject: [PATCH 5/8] Checksum code now activated by --check flag.

---
 download-model.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/download-model.py b/download-model.py
index 52cdae81..b2788a60 100644
--- a/download-model.py
+++ b/download-model.py
@@ -26,6 +26,7 @@ parser.add_argument('--threads', type=int, default=1, help='Number of files to d
 parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
 parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
 parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
+parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
 args = parser.parse_args()
 
 def get_file(url, output_folder):
@@ -215,17 +216,18 @@ if __name__ == '__main__':
     print(f"Downloading the model to {output_folder}")
     download_files(links, output_folder, args.threads)
     
-    # Validate the checksums
-    validated = True
-    for i in range(len(sha256)):
-        with open(output_folder / sha256[i][0], "rb") as f:
-            bytes = f.read()
-            file_hash = hashlib.sha256(bytes).hexdigest()
-            if file_hash != sha256[i][1]:
-                print(f'[!] Checksum for {sha256[i][0]} failed!')
-                validated = False
-    
-    if validated:
-        print('[+] Validated checksums of all model files!')
-    else:
-        print('[-] Rerun the download-model.py with --clean flag')
\ No newline at end of file
+    if args.check:
+        # Validate the checksums
+        validated = True
+        for i in range(len(sha256)):
+            with open(output_folder / sha256[i][0], "rb") as f:
+                bytes = f.read()
+                file_hash = hashlib.sha256(bytes).hexdigest()
+                if file_hash != sha256[i][1]:
+                    print(f'[!] Checksum for {sha256[i][0]} failed!')
+                    validated = False
+        
+        if validated:
+            print('[+] Validated checksums of all model files!')
+        else:
+            print('[-] Rerun the download-model.py with --clean flag')
\ No newline at end of file

From 92c7068daff5655190e448705cdf498904873e93 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 31 Mar 2023 01:31:47 -0300
Subject: [PATCH 6/8] Don't download if --check is specified

---
 download-model.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/download-model.py b/download-model.py
index b2788a60..8e3f2e8b 100644
--- a/download-model.py
+++ b/download-model.py
@@ -9,6 +9,7 @@ python download-model.py facebook/opt-1.3b
 import argparse
 import base64
 import datetime
+import hashlib
 import json
 import re
 import sys
@@ -17,7 +18,6 @@ from pathlib import Path
 import requests
 import tqdm
 from tqdm.contrib.concurrent import thread_map
-import hashlib
 
 parser = argparse.ArgumentParser()
 parser.add_argument('MODEL', type=str, default=None, nargs='?')
@@ -212,22 +212,32 @@ if __name__ == '__main__':
         if sha256_str != '':
             f.write(f'sha256sum:\n{sha256_str}')
 
-    # Downloading the files
-    print(f"Downloading the model to {output_folder}")
-    download_files(links, output_folder, args.threads)
-    
     if args.check:
         # Validate the checksums
         validated = True
         for i in range(len(sha256)):
+            fpath = (output_folder / sha256[i][0])
+
+            if not fpath.exists():
+                print(f"The following file is missing: {fpath}")
+                validated = False
+                continue
+
             with open(output_folder / sha256[i][0], "rb") as f:
                 bytes = f.read()
                 file_hash = hashlib.sha256(bytes).hexdigest()
                 if file_hash != sha256[i][1]:
-                    print(f'[!] Checksum for {sha256[i][0]} failed!')
+                    print(f'Checksum failed: {sha256[i][0]}  {sha256[i][1]}')
                     validated = False
+                else:
+                    print(f'Checksum validated: {sha256[i][0]}  {sha256[i][1]}')
         
         if validated:
             print('[+] Validated checksums of all model files!')
         else:
-            print('[-] Rerun the download-model.py with --clean flag')
\ No newline at end of file
+            print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.')
+
+    else:
+        # Downloading the files
+        print(f"Downloading the model to {output_folder}")
+        download_files(links, output_folder, args.threads)
\ No newline at end of file

From b99bea3c69d997d7d8988b2fcd5def1173e306ea Mon Sep 17 00:00:00 2001
From: Nikita Skakun <nikita@skakun-family.com>
Date: Thu, 30 Mar 2023 23:11:59 -0700
Subject: [PATCH 7/8] Fixed reported header affecting resuming download

---
 download-model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index 8e3f2e8b..75b031bd 100644
--- a/download-model.py
+++ b/download-model.py
@@ -34,7 +34,7 @@ def get_file(url, output_folder):
     output_path = output_folder / filename
     if output_path.exists() and not args.clean:
         # Check if the file has already been downloaded completely
-        r = requests.head(url)
+        r = requests.get(url, stream=True)
         total_size = int(r.headers.get('content-length', 0))
         if output_path.stat().st_size >= total_size:
             return

From 74462ac713201fd6e8036fcc947c57b9c9924ebf Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 31 Mar 2023 22:52:52 -0300
Subject: [PATCH 8/8] Don't override the metadata when checking the sha256sum

---
 download-model.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/download-model.py b/download-model.py
index 75b031bd..43473215 100644
--- a/download-model.py
+++ b/download-model.py
@@ -197,20 +197,7 @@ if __name__ == '__main__':
     output_folder = f"{'_'.join(model.split('/')[-2:])}"
     if branch != 'main':
         output_folder += f'_{branch}'
-
-    # Creating the folder and writing the metadata
     output_folder = Path(base_folder) / output_folder
-    if not output_folder.exists():
-        output_folder.mkdir()
-    with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
-        f.write(f'url: https://huggingface.co/{model}\n')
-        f.write(f'branch: {branch}\n')
-        f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
-        sha256_str = ''
-        for i in range(len(sha256)):
-            sha256_str += f'    {sha256[i][1]} {sha256[i][0]}\n'
-        if sha256_str != '':
-            f.write(f'sha256sum:\n{sha256_str}')
 
     if args.check:
         # Validate the checksums
@@ -238,6 +225,20 @@ if __name__ == '__main__':
             print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.')
 
     else:
+
+        # Creating the folder and writing the metadata
+        if not output_folder.exists():
+            output_folder.mkdir()
+        with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
+            f.write(f'url: https://huggingface.co/{model}\n')
+            f.write(f'branch: {branch}\n')
+            f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
+            sha256_str = ''
+            for i in range(len(sha256)):
+                sha256_str += f'    {sha256[i][1]} {sha256[i][0]}\n'
+            if sha256_str != '':
+                f.write(f'sha256sum:\n{sha256_str}')
+
         # Downloading the files
         print(f"Downloading the model to {output_folder}")
         download_files(links, output_folder, args.threads)
\ No newline at end of file