From e1cd6cc410a8cc7804de8abff5dbfaf72b0c9e5f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 Jun 2023 00:46:18 -0300 Subject: [PATCH 01/15] Minor style change --- css/html_instruct_style.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 132d4dd4..575281b1 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -17,6 +17,10 @@ margin-bottom: 1.25em !important; } +.message-body ul, .message-body ol { + margin-bottom: 1.25em !important; +} + .dark .message-body p em { color: rgb(198, 202, 214) !important; } From ce86f726e9a541adcf78159f1ce5635065d0e404 Mon Sep 17 00:00:00 2001 From: FartyPants Date: Mon, 19 Jun 2023 23:47:36 -0400 Subject: [PATCH 02/15] Added saving of training logs to training_log.json (#2769) --- modules/training.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/modules/training.py b/modules/training.py index 75ba82ca..d039e807 100644 --- a/modules/training.py +++ b/modules/training.py @@ -11,7 +11,7 @@ import gradio as gr import torch import transformers from datasets import Dataset, load_dataset -from peft import (LoraConfig, get_peft_model, prepare_model_for_int8_training, +from peft import (LoraConfig, get_peft_model, prepare_model_for_kbit_training, set_peft_model_state_dict) from modules import shared, ui, utils @@ -38,6 +38,7 @@ except: "GPTNeoXForCausalLM": "gpt_neox" } +train_log = {} WANT_INTERRUPT = False PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after"] @@ -357,7 +358,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch # == Start prepping the model itself == if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'): logger.info("Getting model ready...") - prepare_model_for_int8_training(shared.model) + prepare_model_for_kbit_training(shared.model) logger.info("Prepping for training...") config = LoraConfig( @@ -406,12 +407,19 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch control.should_training_stop = True elif state.global_step > 0 and actual_save_steps > 0 and state.global_step % actual_save_steps == 0: lora_model.save_pretrained(f"{lora_file_path}/checkpoint-{tracked.current_steps}/") + # Save log + with open(f"{lora_file_path}/checkpoint-{tracked.current_steps}/training_log.json", 'w', encoding='utf-8') as file: + json.dump(train_log, file, indent=2) + def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs): tracked.current_steps += 1 if WANT_INTERRUPT: control.should_epoch_stop = True control.should_training_stop = True + + def on_log(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs, **kwargs): + train_log.update(logs) trainer = transformers.Trainer( model=lora_model, @@ -448,7 +456,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch # == Save parameters for reuse == with open(f"{lora_file_path}/training_parameters.json", 'w', encoding='utf-8') as 
file: vars = locals() - json.dump({x: vars[x] for x in PARAMETERS}, file) + json.dump({x: vars[x] for x in PARAMETERS}, file, indent=2) # == Main run and monitor loop == logger.info("Starting training...") @@ -462,7 +470,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch # Note: save in the thread in case the gradio thread breaks (eg browser closed) lora_model.save_pretrained(lora_file_path) logger.info("LoRA training run is completed and saved.") - tracked.did_save = True + # Save log + with open(f"{lora_file_path}/training_log.json", 'w', encoding='utf-8') as file: + json.dump(train_log, file, indent=2) thread = threading.Thread(target=threaded_run) thread.start() From c623e142acb651932d905e7e4dc5ce612f9b0fb4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 Jun 2023 00:49:38 -0300 Subject: [PATCH 03/15] Bump llama-cpp-python --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index b1456490..b4ceb313 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,7 @@ transformers==4.30.2 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524 bitsandbytes==0.39.0; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows" -llama-cpp-python==0.1.62; platform_system != "Windows" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.62/llama_cpp_python-0.1.62-cp310-cp310-win_amd64.whl; platform_system == "Windows" +llama-cpp-python==0.1.64; platform_system != "Windows" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.64/llama_cpp_python-0.1.64-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" From c40932eb39140e039a8a3574497ca94567188ddb Mon Sep 17 00:00:00 2001 From: MikoAL <66015876+MikoAL@users.noreply.github.com> Date: Tue, 20 Jun 2023 12:03:44 +0800 Subject: [PATCH 04/15] Added Falcon LoRA training support (#2684) I am 50% sure this will work --- modules/training.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/training.py b/modules/training.py index d039e807..65f1668a 100644 --- a/modules/training.py +++ b/modules/training.py @@ -30,12 +30,14 @@ try: MODEL_CLASSES = {v: k for k, v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES} except: standard_modules = ["q_proj", "v_proj"] - model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"]} + model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"], "rw":["query_key_value"]} MODEL_CLASSES = { "LlamaForCausalLM": "llama", "OPTForCausalLM": "opt", "GPTJForCausalLM": "gptj", - "GPTNeoXForCausalLM": "gpt_neox" + "GPTNeoXForCausalLM": "gpt_neox", + "RWForCausalLM": "rw" + } train_log = {} From 7625c6de892a2614ad2d2a8486eea224b2ce3dbd Mon Sep 17 00:00:00 2001 From: EugeoSynthesisThirtyTwo Date: Tue, 20 Jun 2023 21:18:42 +0200 Subject: [PATCH 05/15] fix usage of self in classmethod (#2781) --- modules/llamacpp_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 9f6122d9..10a852db 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -52,9 +52,9 @@ class LlamaCppModel: 'n_gpu_layers': shared.args.n_gpu_layers } - self.model = Llama(**params) + result.model = Llama(**params) if cache_capacity > 0: - self.model.set_cache(LlamaCache(capacity_bytes=cache_capacity)) + result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity)) # This is ugly, but the model and the tokenizer are the same object in this library. return result, result From 0d0d849478199c3fc8746df50cc5d656eef63688 Mon Sep 17 00:00:00 2001 From: ramblingcoder Date: Tue, 20 Jun 2023 16:31:28 -0500 Subject: [PATCH 06/15] Update Dockerfile to resolve superbooga requirement error (#2401) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index be6fcfc2..7cc0ff15 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -26,7 +26,7 @@ LABEL maintainer="Your Name " LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI" RUN apt-get update && \ - apt-get install --no-install-recommends -y libportaudio2 libasound-dev git python3 python3-pip make g++ && \ + apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \ rm -rf /var/lib/apt/lists/* RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv From 447569e31ab78c0933850b63e8fe2257faf64180 Mon Sep 17 00:00:00 2001 From: Morgan Schweers Date: Tue, 20 Jun 2023 18:59:14 -0700 Subject: [PATCH 07/15] Add a download progress bar to the web UI. (#2472) * Show download progress on the model screen. * In case of error, mark as done to clear progress bar. * Increase the iteration block size to reduce overhead. 
--- download-model.py | 11 +++++++++-- server.py | 10 +++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/download-model.py b/download-model.py index 540f94c6..e2a951cb 100644 --- a/download-model.py +++ b/download-model.py @@ -194,18 +194,25 @@ class ModelDownloader: r = self.s.get(url, stream=True, headers=headers, timeout=20) with open(output_path, mode) as f: total_size = int(r.headers.get('content-length', 0)) - block_size = 1024 + # Every 4MB we report an update + block_size = 4*1024*1024 + with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: + count = 0 for data in r.iter_content(block_size): t.update(len(data)) f.write(data) + if self.progress_bar is not None: + count += len(data) + self.progress_bar(float(count)/float(total_size), f"Downloading {filename}") def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=1): thread_map(lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True) - def download_model_files(self, model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1): + def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar = None, start_from_scratch=False, threads=1): + self.progress_bar = progress_bar # Creating the folder and writing the metadata if not output_folder.exists(): output_folder.mkdir(parents=True, exist_ok=True) diff --git a/server.py b/server.py index 4198dd69..b6699f14 100644 --- a/server.py +++ b/server.py @@ -122,7 +122,7 @@ def count_tokens(text): return 'Couldn\'t count the number of tokens. Is a tokenizer loaded?' -def download_model_wrapper(repo_id): +def download_model_wrapper(repo_id, progress=gr.Progress()): try: downloader_module = importlib.import_module("download-model") downloader = downloader_module.ModelDownloader() @@ -131,6 +131,7 @@ def download_model_wrapper(repo_id): branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main" check = False + progress(0.0) yield ("Cleaning up the model/branch names") model, branch = downloader.sanitize_model_and_branch_names(model, branch) @@ -141,13 +142,16 @@ def download_model_wrapper(repo_id): output_folder = downloader.get_output_folder(model, branch, is_lora) if check: + progress(0.5) yield ("Checking previously downloaded files") downloader.check_model_files(model, branch, links, sha256, output_folder) + progress(1.0) else: yield (f"Downloading files to {output_folder}") - downloader.download_model_files(model, branch, links, sha256, output_folder, threads=1) + downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1) yield ("Done!") except: + progress(1.0) yield traceback.format_exc() @@ -276,7 +280,7 @@ def create_model_menus(): save_model_settings, [shared.gradio[k] for k in ['model_menu', 'interface_state']], shared.gradio['model_status'], show_progress=False) shared.gradio['lora_menu_apply'].click(load_lora_wrapper, shared.gradio['lora_menu'], shared.gradio['model_status'], show_progress=False) - shared.gradio['download_model_button'].click(download_model_wrapper, shared.gradio['custom_model_menu'], shared.gradio['model_status'], show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, shared.gradio['custom_model_menu'], shared.gradio['model_status'], show_progress=True) shared.gradio['autoload_model'].change(lambda x: 
gr.update(visible=not x), shared.gradio['autoload_model'], load) From b22c7199c9ad7b661ed5953c5ca665d2b1d07b5e Mon Sep 17 00:00:00 2001 From: Peter Sofronas Date: Tue, 20 Jun 2023 22:14:18 -0400 Subject: [PATCH 08/15] Download optimizations (#2786) * download_model_files metadata writing improvement * line swap * reduce line length * safer download and greater block size * Minor changes by pycodestyle --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- download-model.py | 63 +++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/download-model.py b/download-model.py index e2a951cb..993792e9 100644 --- a/download-model.py +++ b/download-model.py @@ -77,7 +77,6 @@ class ModelDownloader: if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None: self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS')) - def sanitize_model_and_branch_names(self, model, branch): if model[-1] == '/': model = model[:-1] @@ -92,7 +91,6 @@ class ModelDownloader: return model, branch - def get_download_links_from_huggingface(self, model, branch, text_only=False): base = "https://huggingface.co" page = f"/api/models/{model}/tree/{branch}" @@ -163,7 +161,6 @@ class ModelDownloader: return links, sha256, is_lora - def get_output_folder(self, model, branch, is_lora, base_folder=None): if base_folder is None: base_folder = 'models' if not is_lora else 'loras' @@ -174,10 +171,11 @@ class ModelDownloader: output_folder = Path(base_folder) / output_folder return output_folder - def get_single_file(self, url, output_folder, start_from_scratch=False): filename = Path(url.rsplit('/', 1)[1]) output_path = output_folder / filename + headers = {} + mode = 'wb' if output_path.exists() and not start_from_scratch: # Check if the file has already been downloaded completely r = self.s.get(url, stream=True, timeout=20) @@ -187,50 +185,45 @@ class ModelDownloader: # Otherwise, resume the download from where it left off headers = {'Range': f'bytes={output_path.stat().st_size}-'} mode = 'ab' - else: - headers = {} - mode = 'wb' - r = self.s.get(url, stream=True, headers=headers, timeout=20) - with open(output_path, mode) as f: + with self.s.get(url, stream=True, headers=headers, timeout=20) as r: + r.raise_for_status() # Do not continue the download if the request was unsuccessful total_size = int(r.headers.get('content-length', 0)) - # Every 4MB we report an update - block_size = 4*1024*1024 - - with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: - count = 0 - for data in r.iter_content(block_size): - t.update(len(data)) - f.write(data) - if self.progress_bar is not None: - count += len(data) - self.progress_bar(float(count)/float(total_size), f"Downloading {filename}") - + block_size = 1024 * 1024 # 1MB + with open(output_path, mode) as f: + with tqdm.tqdm(total=total_size, + unit='iB', + unit_scale=True, + bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}' + ) as t: + count = 0 + for data in r.iter_content(block_size): + t.update(len(data)) + f.write(data) + if self.progress_bar is not None: + count += len(data) + self.progress_bar(float(count) / float(total_size), f"Downloading {filename}") def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=1): thread_map(lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True) - - 
def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar = None, start_from_scratch=False, threads=1): + def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=1): self.progress_bar = progress_bar # Creating the folder and writing the metadata - if not output_folder.exists(): - output_folder.mkdir(parents=True, exist_ok=True) - with open(output_folder / 'huggingface-metadata.txt', 'w') as f: - f.write(f'url: https://huggingface.co/{model}\n') - f.write(f'branch: {branch}\n') - f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') - sha256_str = '' - for i in range(len(sha256)): - sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n' - if sha256_str != '': - f.write(f'sha256sum:\n{sha256_str}') + output_folder.mkdir(parents=True, exist_ok=True) + metadata = f'url: https://huggingface.co/{model}\n' \ + f'branch: {branch}\n' \ + f'download date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n' + sha256_str = '\n'.join([f' {item[1]} {item[0]}' for item in sha256]) + if sha256_str: + metadata += f'sha256sum:\n{sha256_str}' + metadata += '\n' + (output_folder / 'huggingface-metadata.txt').write_text(metadata) # Downloading the files print(f"Downloading the model to {output_folder}") self.start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads) - def check_model_files(self, model, branch, links, sha256, output_folder): # Validate the checksums validated = True From faa92eee8d028c945d49c518b1a2163661bd3aab Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 Jun 2023 23:25:58 -0300 Subject: [PATCH 09/15] Add spaces --- download-model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/download-model.py b/download-model.py index 993792e9..aa9e088c 100644 --- a/download-model.py +++ b/download-model.py @@ -168,6 +168,7 @@ class ModelDownloader: output_folder = f"{'_'.join(model.split('/')[-2:])}" if branch != 'main': output_folder += f'_{branch}' + output_folder = Path(base_folder) / output_folder return output_folder @@ -177,11 +178,13 @@ class ModelDownloader: headers = {} mode = 'wb' if output_path.exists() and not start_from_scratch: + # Check if the file has already been downloaded completely r = self.s.get(url, stream=True, timeout=20) total_size = int(r.headers.get('content-length', 0)) if output_path.stat().st_size >= total_size: return + # Otherwise, resume the download from where it left off headers = {'Range': f'bytes={output_path.stat().st_size}-'} mode = 'ab' @@ -209,14 +212,17 @@ class ModelDownloader: def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=1): self.progress_bar = progress_bar + # Creating the folder and writing the metadata output_folder.mkdir(parents=True, exist_ok=True) metadata = f'url: https://huggingface.co/{model}\n' \ f'branch: {branch}\n' \ f'download date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n' + sha256_str = '\n'.join([f' {item[1]} {item[0]}' for item in sha256]) if sha256_str: metadata += f'sha256sum:\n{sha256_str}' + metadata += '\n' (output_folder / 'huggingface-metadata.txt').write_text(metadata) From 5dfe0bec06297e510c95f7cd950362425c57b76f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 Jun 2023 23:36:56 -0300 Subject: [PATCH 10/15] Remove old/useless code --- 
download-model.py | 62 +++-------------------------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) diff --git a/download-model.py b/download-model.py index aa9e088c..7aea77be 100644 --- a/download-model.py +++ b/download-model.py @@ -1,5 +1,5 @@ ''' -Downloads models from Hugging Face to models/model-name. +Downloads models from Hugging Face to models/username_modelname. Example: python download-model.py facebook/opt-1.3b @@ -11,8 +11,8 @@ import base64 import datetime import hashlib import json -import re import os +import re import sys from pathlib import Path @@ -21,56 +21,6 @@ import tqdm from tqdm.contrib.concurrent import thread_map -def select_model_from_default_options(): - models = { - "OPT 6.7B": ("facebook", "opt-6.7b", "main"), - "OPT 2.7B": ("facebook", "opt-2.7b", "main"), - "OPT 1.3B": ("facebook", "opt-1.3b", "main"), - "OPT 350M": ("facebook", "opt-350m", "main"), - "GALACTICA 6.7B": ("facebook", "galactica-6.7b", "main"), - "GALACTICA 1.3B": ("facebook", "galactica-1.3b", "main"), - "GALACTICA 125M": ("facebook", "galactica-125m", "main"), - "Pythia-6.9B-deduped": ("EleutherAI", "pythia-6.9b-deduped", "main"), - "Pythia-2.8B-deduped": ("EleutherAI", "pythia-2.8b-deduped", "main"), - "Pythia-1.4B-deduped": ("EleutherAI", "pythia-1.4b-deduped", "main"), - "Pythia-410M-deduped": ("EleutherAI", "pythia-410m-deduped", "main"), - } - - choices = {} - print("Select the model that you want to download:\n") - for i, name in enumerate(models): - char = chr(ord('A') + i) - choices[char] = name - print(f"{char}) {name}") - - char_hugging = chr(ord('A') + len(models)) - print(f"{char_hugging}) Manually specify a Hugging Face model") - char_exit = chr(ord('A') + len(models) + 1) - print(f"{char_exit}) Do not download a model") - print() - print("Input> ", end='') - choice = input()[0].strip().upper() - if choice == char_exit: - exit() - elif choice == char_hugging: - print("""\nType the name of your desired Hugging Face model in the format organization/name. 
- -Examples: -facebook/opt-1.3b -EleutherAI/pythia-1.4b-deduped -""") - - print("Input> ", end='') - model = input() - branch = "main" - else: - arr = models[choices[choice]] - model = f"{arr[0]}/{arr[1]}" - branch = arr[2] - - return model, branch - - class ModelDownloader: def __init__(self): self.s = requests.Session() @@ -194,11 +144,7 @@ class ModelDownloader: total_size = int(r.headers.get('content-length', 0)) block_size = 1024 * 1024 # 1MB with open(output_path, mode) as f: - with tqdm.tqdm(total=total_size, - unit='iB', - unit_scale=True, - bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}' - ) as t: + with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: count = 0 for data in r.iter_content(block_size): t.update(len(data)) @@ -270,8 +216,6 @@ if __name__ == '__main__': branch = args.branch model = args.MODEL - if model is None: - model, branch = select_model_from_default_options() downloader = ModelDownloader() # Cleaning up the model/branch names From 2661c9899aa2404e66ea642e86a0fd081ce676a9 Mon Sep 17 00:00:00 2001 From: missionfloyd Date: Wed, 21 Jun 2023 07:39:58 -0600 Subject: [PATCH 11/15] Format chat for printing (#2793) --- css/chat.css | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/css/chat.css b/css/chat.css index f8cb765a..fcf19ee0 100644 --- a/css/chat.css +++ b/css/chat.css @@ -93,3 +93,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .message-body :not(pre) > code { white-space: normal !important; } + +@media print { + body { + visibility: hidden; + } + + .chat { + visibility: visible; + position: absolute; + left: 0; + top: 0; + max-width: none; + max-height: none; + width: 100%; + height: fit-content; + display: flex; + flex-direction: column-reverse; + } +} From 90be1d9fe187f653c058b5ad89b8207272d2cb19 Mon Sep 17 00:00:00 2001 From: matatonic <73265741+matatonic@users.noreply.github.com> Date: Wed, 21 Jun 2023 11:30:44 -0400 Subject: [PATCH 12/15] More models (match more) & templates (starchat-beta, tulu) (#2790) --- .../instruction-following/Starchat-Beta.yaml | 4 +++ characters/instruction-following/Tulu.yaml | 4 +++ models/config.yaml | 34 ++++++++++++++++--- 3 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 characters/instruction-following/Starchat-Beta.yaml create mode 100644 characters/instruction-following/Tulu.yaml diff --git a/characters/instruction-following/Starchat-Beta.yaml b/characters/instruction-following/Starchat-Beta.yaml new file mode 100644 index 00000000..2af4ee6b --- /dev/null +++ b/characters/instruction-following/Starchat-Beta.yaml @@ -0,0 +1,4 @@ +user: "<|user|>" +bot: "<|assistant|>" +context: "<|system|>\n<|end|>\n" +turn_template: "<|user|>\n<|user-message|><|end|>\n<|bot|>\n<|bot-message|><|end|>\n" diff --git a/characters/instruction-following/Tulu.yaml b/characters/instruction-following/Tulu.yaml new file mode 100644 index 00000000..13dd14f9 --- /dev/null +++ b/characters/instruction-following/Tulu.yaml @@ -0,0 +1,4 @@ +user: "<|user|>" +bot: "<|assistant|>" +context: "" +turn_template: "<|user|>\n<|user-message|>\n<|bot|>\n<|bot-message|>\n" diff --git a/models/config.yaml b/models/config.yaml index 715bbf71..318b0822 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -50,7 +50,7 @@ llama-65b-gptq-3bit: .*vicuna.*v0: mode: 'instruct' instruction_template: 'Vicuna-v0' -.*vicuna.*(1.1|1_1): +.*vicuna.*(1.1|1_1|1.3|1_3): mode: 'instruct' instruction_template: 'Vicuna-v1.1' 
.*wizard.*vicuna: @@ -184,7 +184,7 @@ llama-65b-gptq-3bit: .*Nous-Hermes-13b: mode: 'instruct' instruction_template: 'Alpaca' -.*airoboros-13b-gpt4: +.*airoboros: mode: 'instruct' instruction_template: 'Vicuna-v1.1' .*WizardLM-30B-V1.0: @@ -193,7 +193,7 @@ llama-65b-gptq-3bit: TheBloke_WizardLM-30B-GPTQ: mode: 'instruct' instruction_template: 'Vicuna-v1.1' -.*(A|a)lpa(cino|sta): +.*alpa(cino|sta): mode: 'instruct' instruction_template: 'Alpaca' .*hippogriff: @@ -202,9 +202,33 @@ TheBloke_WizardLM-30B-GPTQ: .*gpt4all-.*-snoozy: mode: 'instruct' instruction_template: 'WizardLM' -.*(L|l)azarus: +.*lazarus: mode: 'instruct' instruction_template: 'Alpaca' -.*(G|g)uanaco-.*(7|13|33|65)(b|B): +.*guanaco-.*(7|13|33|65)b: mode: 'instruct' instruction_template: 'Guanaco' +.*hypermantis: + mode: 'instruct' + instruction_template: 'Alpaca' +.*open-llama-.*-open-instruct: + mode: 'instruct' + instruction_template: 'Alpaca' +.*starcoder-gpteacher-code-instruct: + mode: 'instruct' + instruction_template: 'Alpaca' +.*tulu: + mode: 'instruct' + instruction_template: 'Tulu' +.*chronos: + mode: 'instruct' + instruction_template: 'Alpaca' +.*samantha: + mode: 'instruct' + instruction_template: 'Samantha' +.*wizardcoder: + mode: 'instruct' + instruction_template: 'Alpaca' +.*starchat-beta: + mode: 'instruct' + instruction_template: 'Starchat-Beta' \ No newline at end of file From 89fb6f9236849a66920e20716240343edc0f3d9d Mon Sep 17 00:00:00 2001 From: Gaurav Bhagchandani Date: Wed, 21 Jun 2023 11:31:50 -0400 Subject: [PATCH 13/15] Fixed the ZeroDivisionError when downloading a model (#2797) --- download-model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download-model.py b/download-model.py index 7aea77be..e04aedc3 100644 --- a/download-model.py +++ b/download-model.py @@ -149,7 +149,7 @@ class ModelDownloader: for data in r.iter_content(block_size): t.update(len(data)) f.write(data) - if self.progress_bar is not None: + if total_size != 0 and self.progress_bar is not None: count += len(data) self.progress_bar(float(count) / float(total_size), f"Downloading {filename}") From a06acd6d09fc6b39241a044d9a44b2e44c90f5a5 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Wed, 21 Jun 2023 13:04:45 -0500 Subject: [PATCH 14/15] Update bitsandbytes to 0.39.1 (#2799) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index b4ceb313..4a31685b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,8 +17,8 @@ tqdm scipy transformers==4.30.2 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524 -bitsandbytes==0.39.0; platform_system != "Windows" -https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows" +bitsandbytes==0.39.1; platform_system != "Windows" +https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl; platform_system == "Windows" llama-cpp-python==0.1.64; platform_system != "Windows" https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.64/llama_cpp_python-0.1.64-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" From 580c1ee7483e9934b7884ff0071c1784060ba6eb Mon Sep 17 00:00:00 2001 From: LarryVRH Date: Thu, 22 Jun 2023 02:31:42 +0800 Subject: [PATCH 15/15] 
Implement a demo HF wrapper for exllama to utilize existing HF transformers decoding. (#2777) --- README.md | 2 +- modules/exllama_hf.py | 82 ++++++++++++++++++++++++++++++++++++++ modules/loaders.py | 4 ++ modules/models.py | 9 ++++- modules/shared.py | 4 +- modules/text_generation.py | 3 +- server.py | 3 +- 7 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 modules/exllama_hf.py diff --git a/README.md b/README.md index 9d2e1b00..24c04711 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,7 @@ Optionally, you can use the following command-line flags: | Flag | Description | |--------------------------------------------|-------------| -| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, llamacpp, rwkv, flexgen | +| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen | #### Accelerate/transformers diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py new file mode 100644 index 00000000..27cac374 --- /dev/null +++ b/modules/exllama_hf.py @@ -0,0 +1,82 @@ +import os +import sys +from pathlib import Path +from typing import * + +import torch +from transformers import ( + GenerationConfig, + LlamaTokenizer, + PretrainedConfig, + PreTrainedModel +) +from transformers.modeling_outputs import CausalLMOutputWithPast + +from modules import shared +from modules.logging_colors import logger +from modules.relative_imports import RelativeImport + +with RelativeImport("repositories/exllama"): + from model import ExLlama, ExLlamaCache, ExLlamaConfig + + +class ExllamaHF(PreTrainedModel): + def __init__(self, config: ExLlamaConfig): + super().__init__(PretrainedConfig()) + self.ex_config = config + self.ex_model = ExLlama(self.ex_config) + self.generation_config = GenerationConfig() + + def _validate_model_class(self): + pass + + def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): + pass + + def prepare_inputs_for_generation(self, input_ids, **kwargs): + return {'input_ids': input_ids, **kwargs} + + @property + def device(self) -> torch.device: + # TODO: May cause problem on multi-gpu inference? 
+ return torch.device(0) + + def __call__(self, *args, **kwargs): + # TODO: Some decoding methods (such as Contrastive Search) may not work at this time + assert len(args) == 0, 'no *args should be passed to forward' + use_cache = kwargs['use_cache'] + seq = kwargs['input_ids'][0].tolist() + cache = kwargs['past_key_values'] if 'past_key_values' in kwargs else None + if cache is None: + cache = ExLlamaCache(self.ex_model) + self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), cache, preprocess_only=True) + logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), cache).to(self.device) + return CausalLMOutputWithPast(logits=logits, past_key_values=cache if use_cache else None) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported" + if isinstance(pretrained_model_name_or_path, str): + pretrained_model_name_or_path = Path(pretrained_model_name_or_path) + + pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path) + config = ExLlamaConfig(pretrained_model_name_or_path / 'config.json') + + # from 'oobabooga/text-generation-webui/modules/exllama.py' + weight_path = None + for ext in ['.safetensors', '.pt', '.bin']: + found = list(pretrained_model_name_or_path.glob(f"*{ext}")) + if len(found) > 0: + weight_path = found[-1] + break + assert weight_path is not None, f'could not find weight in "{pretrained_model_name_or_path}"' + + config.model_path = str(weight_path) + + # This slowes down a bit but align better with autogptq generation. + # TODO: Should give user choice to tune the exllama config + config.act_order = True + config.fused_attn = False + config.fused_mlp_thd = 0 + + return ExllamaHF(config) \ No newline at end of file diff --git a/modules/loaders.py b/modules/loaders.py index ac6f80bd..21642023 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -55,6 +55,10 @@ loaders_and_params = { 'ExLlama' : [ 'gpu_split', 'exllama_info', + ], + 'ExLlama_HF' : [ + 'gpu_split', + 'exllama_HF_info', ] } diff --git a/modules/models.py b/modules/models.py index 1aba66c5..574e1646 100644 --- a/modules/models.py +++ b/modules/models.py @@ -49,7 +49,8 @@ def load_model(model_name, loader=None): 'llama.cpp': llamacpp_loader, 'FlexGen': flexgen_loader, 'RWKV': RWKV_loader, - 'ExLlama': ExLlama_loader + 'ExLlama': ExLlama_loader, + 'ExLlama_HF': ExLlama_HF_loader } if loader is None: @@ -278,6 +279,12 @@ def ExLlama_loader(model_name): return model, tokenizer +def ExLlama_HF_loader(model_name): + from modules.exllama_hf import ExllamaHF + + return ExllamaHF.from_pretrained(model_name) + + def get_max_memory_dict(): max_memory = {} if shared.args.gpu_memory: diff --git a/modules/shared.py b/modules/shared.py index ecc03fc9..e065b76b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -98,7 +98,7 @@ parser.add_argument('--extensions', type=str, nargs="+", help='The list of exten parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') # Model loader -parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, llamacpp, rwkv, flexgen') +parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. 
Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen') # Accelerate/transformers parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') @@ -218,6 +218,8 @@ def fix_loader_name(name): return 'GPTQ-for-LLaMa' elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: return 'ExLlama' + elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']: + return 'ExLlama_HF' if args.loader is not None: diff --git a/modules/text_generation.py b/modules/text_generation.py index 0d2f55c2..d0965b8a 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -104,9 +104,8 @@ def get_reply_from_output_ids(output_ids, input_ids, original_question, state, i else: new_tokens = len(output_ids) - len(input_ids[0]) reply = decode(output_ids[-new_tokens:], state['skip_special_tokens']) - # Prevent LlamaTokenizer from skipping a space - if type(shared.tokenizer) is transformers.LlamaTokenizer and len(output_ids) > 0: + if type(shared.tokenizer) in [transformers.LlamaTokenizer, transformers.LlamaTokenizerFast] and len(output_ids) > 0: if shared.tokenizer.convert_ids_to_tokens(int(output_ids[-new_tokens])).startswith('▁'): reply = ' ' + reply diff --git a/server.py b/server.py index b6699f14..ff4f2c3f 100644 --- a/server.py +++ b/server.py @@ -197,7 +197,7 @@ def create_model_menus(): with gr.Row(): with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "llama.cpp"], value=None) + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "ExLlama_HF", "llama.cpp"], value=None) with gr.Box(): with gr.Row(): with gr.Column(): @@ -237,6 +237,7 @@ def create_model_menus(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['exllama_info'] = gr.Markdown('ExLlama has to be installed manually. See the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') + shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s still a bit buggy, so feel free to help out by fixing issues.\n\nCheck out PR [#2777](https://github.com/oobabooga/text-generation-webui/pull/2777) for more details.') with gr.Column(): with gr.Row():
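
PATCH 15 above wires ExLlama into the existing Transformers decoding path by exposing it as a PreTrainedModel. The following is a minimal usage sketch of that idea, not code from the patch series: it assumes the webui environment (repositories/exllama checked out, shared.args initialized, a CUDA device available) and a quantized Llama model folder; the folder name, the choice of LlamaTokenizer, and the sampling parameters are illustrative assumptions.

    from transformers import LlamaTokenizer
    from modules.exllama_hf import ExllamaHF

    # Folder name is a placeholder; ExllamaHF.from_pretrained() resolves it
    # relative to shared.args.model_dir (default: "models/").
    model = ExllamaHF.from_pretrained("my-llama-gptq")
    tokenizer = LlamaTokenizer.from_pretrained("models/my-llama-gptq")

    # The wrapper returns logits on GPU 0, so the prompt ids are moved there too.
    input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids.to("cuda")

    # Because ExllamaHF subclasses PreTrainedModel and implements __call__ and
    # prepare_inputs_for_generation, the stock Transformers sampling loop can drive it.
    output_ids = model.generate(input_ids, max_new_tokens=32, do_sample=True, temperature=0.7)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

In the webui itself this path is selected with --loader exllama_hf (or the ExLlama_HF entry in the model menu), and generation then goes through the same Transformers samplers used for regular Transformers models.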