Download optimizations (#2786)

* download_model_files metadata writing improvement
* line swap
* reduce line length
* safer download and greater block size
* Minor changes by pycodestyle

---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
2024-11-25 17:29:22 +01:00 · 2023-06-20 22:14:18 -04:00 · 2023-06-20 22:14:18 -04:00 · b22c7199c9
commit b22c7199c9
parent 447569e31a
1 changed files with 28 additions and 35 deletions
--- a/download-model.py
+++ b/download-model.py
@ -77,7 +77,6 @@ class ModelDownloader:
        if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
            self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))

-
    def sanitize_model_and_branch_names(self, model, branch):
        if model[-1] == '/':
            model = model[:-1]
@ -92,7 +91,6 @@ class ModelDownloader:

        return model, branch

-
    def get_download_links_from_huggingface(self, model, branch, text_only=False):
        base = "https://huggingface.co"
        page = f"/api/models/{model}/tree/{branch}"
@ -163,7 +161,6 @@ class ModelDownloader:

        return links, sha256, is_lora

-
    def get_output_folder(self, model, branch, is_lora, base_folder=None):
        if base_folder is None:
            base_folder = 'models' if not is_lora else 'loras'
@ -174,10 +171,11 @@ class ModelDownloader:
        output_folder = Path(base_folder) / output_folder
        return output_folder

-
    def get_single_file(self, url, output_folder, start_from_scratch=False):
        filename = Path(url.rsplit('/', 1)[1])
        output_path = output_folder / filename
+        headers = {}
+        mode = 'wb'
        if output_path.exists() and not start_from_scratch:
            # Check if the file has already been downloaded completely
            r = self.s.get(url, stream=True, timeout=20)
@ -187,50 +185,45 @@ class ModelDownloader:
            # Otherwise, resume the download from where it left off
            headers = {'Range': f'bytes={output_path.stat().st_size}-'}
            mode = 'ab'
-        else:
-            headers = {}
-            mode = 'wb'

-        r = self.s.get(url, stream=True, headers=headers, timeout=20)
-        with open(output_path, mode) as f:
+        with self.s.get(url, stream=True, headers=headers, timeout=20) as r:
+            r.raise_for_status()  # Do not continue the download if the request was unsuccessful
            total_size = int(r.headers.get('content-length', 0))
-            # Every 4MB we report an update
-            block_size = 4*1024*1024
-
-            with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
+            block_size = 1024 * 1024  # 1MB
+            with open(output_path, mode) as f:
+                with tqdm.tqdm(total=total_size,
+                               unit='iB',
+                               unit_scale=True,
+                               bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}'
+                               ) as t:
                    count = 0
                    for data in r.iter_content(block_size):
                        t.update(len(data))
                        f.write(data)
                        if self.progress_bar is not None:
                            count += len(data)
-                        self.progress_bar(float(count)/float(total_size), f"Downloading {filename}")
-
+                            self.progress_bar(float(count) / float(total_size), f"Downloading {filename}")

    def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=1):
        """Fetch every URL in ``file_list`` into ``output_folder`` via ``thread_map``.

        Each URL is passed to ``self.get_single_file`` together with
        ``output_folder`` and ``start_from_scratch``. ``threads`` is forwarded
        to ``thread_map`` as ``max_workers``, and ``disable=True`` is passed
        through unchanged (presumably to suppress thread_map's own aggregate
        progress bar -- confirm against the tqdm documentation).
        """
        def fetch(link):
            # One worker call per URL; the result is handed back to thread_map as-is.
            return self.get_single_file(link, output_folder, start_from_scratch=start_from_scratch)

        thread_map(fetch, file_list, max_workers=threads, disable=True)

-
-    def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar = None, start_from_scratch=False, threads=1):
+    def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=1):
        self.progress_bar = progress_bar
        # Creating the folder and writing the metadata
-        if not output_folder.exists():
        output_folder.mkdir(parents=True, exist_ok=True)
-        with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
-            f.write(f'url: https://huggingface.co/{model}\n')
-            f.write(f'branch: {branch}\n')
-            f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
-            sha256_str = ''
-            for i in range(len(sha256)):
-                sha256_str += f'    {sha256[i][1]} {sha256[i][0]}\n'
-            if sha256_str != '':
-                f.write(f'sha256sum:\n{sha256_str}')
+        metadata = f'url: https://huggingface.co/{model}\n' \
+                   f'branch: {branch}\n' \
+                   f'download date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
+        sha256_str = '\n'.join([f'    {item[1]} {item[0]}' for item in sha256])
+        if sha256_str:
+            metadata += f'sha256sum:\n{sha256_str}'
+        metadata += '\n'
+        (output_folder / 'huggingface-metadata.txt').write_text(metadata)

        # Downloading the files
        print(f"Downloading the model to {output_folder}")
        self.start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads)

-
    def check_model_files(self, model, branch, links, sha256, output_folder):
        # Validate the checksums
        validated = True