From e17af5926156c1f9f9d793b29bc360bece97a5a0 Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 00:21:34 -0700
Subject: [PATCH 01/42] Add support for resuming downloads

This commit adds the ability to resume interrupted downloads by adding
a new function to the downloader module. The function uses the HTTP
Range header to fetch only the remaining part of a file that wasn't
downloaded yet.
---
 download-model.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/download-model.py b/download-model.py
index 7e5f61b2..94524f76 100644
--- a/download-model.py
+++ b/download-model.py
@@ -27,8 +27,23 @@ parser.add_argument('--output', type=str, default=None, help='The folder where t
 args = parser.parse_args()
 
 def get_file(url, output_folder):
-    r = requests.get(url, stream=True)
-    with open(output_folder / Path(url.rsplit('/', 1)[1]), 'wb') as f:
+    filename = Path(url.rsplit('/', 1)[1])
+    output_path = output_folder / filename
+    if output_path.exists():
+        # Check if the file has already been downloaded completely
+        r = requests.head(url)
+        total_size = int(r.headers.get('content-length', 0))
+        if output_path.stat().st_size == total_size:
+            return
+        # Otherwise, resume the download from where it left off
+        headers = {'Range': f'bytes={output_path.stat().st_size}-'}
+        mode = 'ab'
+    else:
+        headers = {}
+        mode = 'wb'
+
+    r = requests.get(url, stream=True, headers=headers)
+    with open(output_path, mode) as f:
         total_size = int(r.headers.get('content-length', 0))
         block_size = 1024
         with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
@@ -149,7 +164,7 @@ def get_download_links_from_huggingface(model, branch):
     return links, sha256, is_lora
 
 def download_files(file_list, output_folder, num_threads=8):
-    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads)
+    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads, disable=True)
 
 if __name__ == '__main__':
     model = args.MODEL

From 8c590c2362d4ee783b41a93f6c03f8e19dc40657 Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 00:42:19 -0700
Subject: [PATCH 02/42] Added a 'clean' flag to not resume download.

---
 download-model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index 94524f76..ed8a047d 100644
--- a/download-model.py
+++ b/download-model.py
@@ -24,12 +24,13 @@ parser.add_argument('--branch', type=str, default='main', help='Name of the Git
 parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
 parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
 parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
 args = parser.parse_args()
 
 def get_file(url, output_folder):
     filename = Path(url.rsplit('/', 1)[1])
     output_path = output_folder / filename
-    if output_path.exists():
+    if output_path.exists() and not args.clean:
         # Check if the file has already been downloaded completely
         r = requests.head(url)
         total_size = int(r.headers.get('content-length', 0))

From 297ac051d91c52ee6e0a39bc745d84f85a305346 Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 02:34:19 -0700
Subject: [PATCH 03/42] Added sha256 validation of model files.

---
 download-model.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index ed8a047d..0b3f16d7 100644
--- a/download-model.py
+++ b/download-model.py
@@ -17,6 +17,7 @@ from pathlib import Path
 import requests
 import tqdm
 from tqdm.contrib.concurrent import thread_map
+import hashlib
 
 parser = argparse.ArgumentParser()
 parser.add_argument('MODEL', type=str, default=None, nargs='?')
@@ -213,4 +214,17 @@ if __name__ == '__main__':
     # Downloading the files
     print(f"Downloading the model to {output_folder}")
     download_files(links, output_folder, args.threads)
-    print()
+
+    print('\n')
+    # Validate the checksums
+    validated = True
+    for i in range(len(sha256)):
+        with open(output_folder / sha256[i][0], "rb") as f:
+            bytes = f.read()
+            file_hash = hashlib.sha256(bytes).hexdigest()
+            if file_hash != sha256[i][1]:
+                print(f'[!] Checksum for {sha256[i][0]} failed!')
+                validated = False
+
+    if validated:
+        print('[+] Validated checksums of all model files!')
\ No newline at end of file

From 53ab1e285d37340e660adff6a560f1b95463aa29 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 19:52:08 -0700
Subject: [PATCH 04/42] Update .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a9c47a5a..aec1f1cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.vscode
 cache
 characters
 training/datasets

From 7a562481fa3eb73455c7aabdf24f19673e13fc18 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sat, 18 Mar 2023 23:42:10 -0700
Subject: [PATCH 05/42] Initial version of llamacpp_model.py

---
 modules/llamacpp_model.py | 94 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 modules/llamacpp_model.py

diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
new file mode 100644
index 00000000..21415fa2
--- /dev/null
+++ b/modules/llamacpp_model.py
@@ -0,0 +1,94 @@
+import os
+from pathlib import Path
+import modules.shared as shared
+from modules.callbacks import Iteratorize
+
+import llamacpp
+
+
+class LlamaCppTokenizer:
+    """A thin wrapper over the llamacpp tokenizer"""
+    def __init__(self, model: llamacpp.PyLLAMA):
+        self._tokenizer = model.get_tokenizer()
+        self.eos_token_id = 2
+        self.bos_token_id = 0
+
+    @classmethod
+    def from_model(cls, model: llamacpp.PyLLAMA):
+        return cls(model)
+
+    def encode(self, prompt):
+        return self._tokenizer.tokenize(prompt)
+
+    def decode(self, ids):
+        return self._tokenizer.detokenize(ids)
+
+
+class LlamaCppModel:
+    def __init__(self):
+        self.initialized = False
+
+    @classmethod
+    def from_pretrained(self, path):
+        params = llamacpp.gpt_params(
+            str(path),  # model
+            2048,  # ctx_size
+            200,  # n_predict
+            40,  # top_k
+            0.95,  # top_p
+            0.80,  # temp
+            1.30,  # repeat_penalty
+            -1,  # seed
+            8,  # threads
+            64,  # repeat_last_n
+            8,  # batch_size
+        )
+
+        _model = llamacpp.PyLLAMA(params)
+
+        result = self()
+        result.model = _model
+
+        tokenizer = LlamaCppTokenizer.from_model(_model)
+        return result, tokenizer
+
+    # TODO: Allow passing in params for each inference
+    def generate(self, context="", num_tokens=10, callback=None):
+        # params = self.params
+        # params.n_predict = token_count
+        # params.top_p = top_p
+        # params.top_k = top_k
+        # params.temp = temperature
+        # params.repeat_penalty = repetition_penalty
+        # params.repeat_last_n = repeat_last_n
+
+        # model.params = params
+        if not self.initialized:
+            self.model.add_bos()
+
+        self.model.update_input(context)
+        if not self.initialized:
+            self.model.prepare_context()
+            self.initialized = True
+
+        output = ""
+        is_end_of_text = False
+        ctr = 0
+        while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text:
+            if self.model.has_unconsumed_input():
+                self.model.ingest_all_pending_input(False)
+            else:
+                text, is_end_of_text = self.model.infer_text()
+                if callback:
+                    callback(text)
+                output += text
+                ctr += 1
+
+        return output
+
+    def generate_with_streaming(self, **kwargs):
+        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+            reply = kwargs['context']
+            for token in generator:
+                reply += token
+                yield reply

From 7745faa7bb39c8f925d6b34d4a61c0a0778e13c0 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sat, 18 Mar 2023 23:42:28 -0700
Subject: [PATCH 06/42] Add llamacpp to models.py

---
 modules/models.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modules/models.py b/modules/models.py
index b19507db..c60af8e2 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -42,9 +42,10 @@ def load_model(model_name):
     t0 = time.time()
 
     shared.is_RWKV = 'rwkv-' in model_name.lower()
+    shared.is_llamacpp = model_name.lower().startswith('llamacpp-')
 
     # Default settings
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
             model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
@@ -100,6 +101,12 @@ def load_model(model_name):
 
         model = load_quantized(model_name)
 
+    # LLAMACPP model
+    elif shared.is_llamacpp:
+        from modules.llamacpp_model import LlamaCppModel
+        model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin'))
+        return model, tokenizer
+
     # Custom
     else:
         params = {"low_cpu_mem_usage": True}

From a5f5736e748bad56ebd9c9c88d1cfa6f3fde97db Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 19:51:43 -0700
Subject: [PATCH 07/42] Add to text_generation.py

---
 modules/text_generation.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/text_generation.py b/modules/text_generation.py
index 7b5fcd6a..e18a76d7 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -22,7 +22,7 @@ def get_max_prompt_length(tokens):
     return max_length
 
 def encode(prompt, tokens_to_generate=0, add_special_tokens=True):
-    if shared.is_RWKV:
+    if shared.is_RWKV or shared.is_llamacpp:
         input_ids = shared.tokenizer.encode(str(prompt))
         input_ids = np.array(input_ids).reshape(1, len(input_ids))
         return input_ids
@@ -142,6 +142,24 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             input_ids = encode(question)
             print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
         return
+    elif shared.is_llamacpp:
+        try:
+            if shared.args.no_stream:
+                reply = shared.model.generate(context=question, num_tokens=max_new_tokens)
+                yield formatted_outputs(reply, shared.model_name)
+            else:
+                if not (shared.args.chat or shared.args.cai_chat):
+                    yield formatted_outputs(question, shared.model_name)
+                for reply in shared.model.generate_with_streaming(context=question, num_tokens=max_new_tokens):
+                    yield formatted_outputs(reply, shared.model_name)
+        except Exception as e:
+            print(e)
+        finally:
+            t1 = time.time()
+            output = encode(reply)[0]
+            input_ids = encode(question)
+            print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
+        return
 
     input_ids = encode(question, max_new_tokens)
     original_input_ids = input_ids

From 8953a262cb25a0dc3d5c486aba0e3f4175d83ffb Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 19:59:25 -0700
Subject: [PATCH 08/42] Add llamacpp to requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 79da715d..e92c6889 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ safetensors==0.3.0
 sentencepiece
 tqdm
 datasets
+llamacpp>=0.1.9
 git+https://github.com/huggingface/transformers

From 79fa2b6d7e338a61ed978fb1e5411838779e3761 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 21:30:24 -0700
Subject: [PATCH 09/42] Add support for alpaca

---
 modules/models.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index c60af8e2..e9fed4a9 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -42,7 +42,8 @@ def load_model(model_name):
     t0 = time.time()
 
     shared.is_RWKV = 'rwkv-' in model_name.lower()
-    shared.is_llamacpp = model_name.lower().startswith('llamacpp-')
+    shared.is_llamacpp = model_name.lower().startswith('llamacpp-') or \
+                         model_name.lower().startswith('alpaca-cpp-')
 
     # Default settings
     if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]):
@@ -104,7 +105,13 @@ def load_model(model_name):
     # LLAMACPP model
     elif shared.is_llamacpp:
         from modules.llamacpp_model import LlamaCppModel
-        model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin'))
+
+        if model_name.lower().startswith('alpaca-'):
+            model_file = f'models/{model_name}/ggml-alpaca-7b-q4.bin'
+        else:
+            model_file = f'models/{model_name}/ggml-model-q4_0.bin'
+
+        model, tokenizer = LlamaCppModel.from_pretrained(Path(model_file))
         return model, tokenizer
 
     # Custom

From 7fa5d96c220324c4b43dfe4dfdf1267137fc94cd Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Wed, 29 Mar 2023 21:20:22 +0100
Subject: [PATCH 10/42] Update to use new llamacpp API

---
 modules/llamacpp_model.py | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 21415fa2..f65ecb4e 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -8,16 +8,16 @@ import llamacpp
 
 class LlamaCppTokenizer:
     """A thin wrapper over the llamacpp tokenizer"""
-    def __init__(self, model: llamacpp.PyLLAMA):
+    def __init__(self, model: llamacpp.LlamaInference):
         self._tokenizer = model.get_tokenizer()
         self.eos_token_id = 2
         self.bos_token_id = 0
 
     @classmethod
-    def from_model(cls, model: llamacpp.PyLLAMA):
+    def from_model(cls, model: llamacpp.LlamaInference):
         return cls(model)
 
-    def encode(self, prompt):
+    def encode(self, prompt: str):
         return self._tokenizer.tokenize(prompt)
 
     def decode(self, ids):
@@ -30,21 +30,10 @@ class LlamaCppModel:
 
     @classmethod
     def from_pretrained(self, path):
-        params = llamacpp.gpt_params(
-            str(path),  # model
-            2048,  # ctx_size
-            200,  # n_predict
-            40,  # top_k
-            0.95,  # top_p
-            0.80,  # temp
-            1.30,  # repeat_penalty
-            -1,  # seed
-            8,  # threads
-            64,  # repeat_last_n
-            8,  # batch_size
-        )
+        params = llamacpp.InferenceParams()
+        params.path_model = str(path)
 
-        _model = llamacpp.PyLLAMA(params)
+        _model = llamacpp.LlamaInference(params)
 
         result = self()
         result.model = _model
@@ -63,22 +52,20 @@ class LlamaCppModel:
         # params.repeat_last_n = repeat_last_n
 
         # model.params = params
-        if not self.initialized:
-            self.model.add_bos()
-
+        self.model.add_bos()
         self.model.update_input(context)
-        if not self.initialized:
-            self.model.prepare_context()
-            self.initialized = True
 
         output = ""
         is_end_of_text = False
         ctr = 0
-        while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text:
+        while ctr < num_tokens and not is_end_of_text:
             if self.model.has_unconsumed_input():
-                self.model.ingest_all_pending_input(False)
+                self.model.ingest_all_pending_input()
             else:
-                text, is_end_of_text = self.model.infer_text()
+                self.model.eval()
+                token = self.model.sample()
+                text = self.model.token_to_str(token)
+                is_end_of_text = token == self.model.token_eos()
                 if callback:
                     callback(text)
                 output += text

From d550c12a3eda0aeceacc8013f9f77808d9f524be Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 12:52:16 -0700
Subject: [PATCH 11/42] Fixed the bug with additional bytes.

The issue seems to be with huggingface not reporting the entire size of
the model.
Added an error message with instructions if the checksums don't match.
---
 download-model.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/download-model.py b/download-model.py
index 0b3f16d7..52cdae81 100644
--- a/download-model.py
+++ b/download-model.py
@@ -35,7 +35,7 @@ def get_file(url, output_folder):
         # Check if the file has already been downloaded completely
         r = requests.head(url)
         total_size = int(r.headers.get('content-length', 0))
-        if output_path.stat().st_size == total_size:
+        if output_path.stat().st_size >= total_size:
             return
         # Otherwise, resume the download from where it left off
         headers = {'Range': f'bytes={output_path.stat().st_size}-'}
@@ -215,7 +215,6 @@ if __name__ == '__main__':
     print(f"Downloading the model to {output_folder}")
     download_files(links, output_folder, args.threads)
 
-    print('\n')
     # Validate the checksums
     validated = True
     for i in range(len(sha256)):
@@ -227,4 +226,6 @@ if __name__ == '__main__':
             validated = False
 
     if validated:
-        print('[+] Validated checksums of all model files!')
\ No newline at end of file
+        print('[+] Validated checksums of all model files!')
+    else:
+        print('[-] Rerun the download-model.py with --clean flag')
\ No newline at end of file

From bb69e054a7241798a8bc0c784747653aa753daab Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 30 Mar 2023 21:08:50 -0300
Subject: [PATCH 12/42] Add dummy file

---
 loras/place-your-loras-here.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 loras/place-your-loras-here.txt

diff --git a/loras/place-your-loras-here.txt b/loras/place-your-loras-here.txt
new file mode 100644
index 00000000..e69de29b

From f9940b79dc9f67477cec3819dbfb250f5e92a658 Mon Sep 17 00:00:00 2001
From: ye7iaserag
Date: Fri, 31 Mar 2023 04:56:49 +0200
Subject: [PATCH 13/42] Implement character gallery using Dataset

---
 extensions/gallery/script.py | 38 +++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py
index fbf23bc9..e96fe40a 100644
--- a/extensions/gallery/script.py
+++ b/extensions/gallery/script.py
@@ -1,20 +1,23 @@
 from pathlib import Path
-
 import gradio as gr
-
 from modules.html_generator import get_image_cache
+from modules.chat import load_character
+from modules.shared import gradio, settings
 
-
-def generate_html():
+def generate_css():
     css = """
-      .character-gallery {
+      .character-gallery > .gallery {
         margin: 1rem 0;
-        display: grid;
+        display: grid !important;
         grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
         grid-column-gap: 0.4rem;
         grid-row-gap: 1.2rem;
       }
 
+      .character-gallery > .label {
+        display: none !important;
+      }
+
       .character-container {
         cursor: pointer;
         text-align: center;
         overflow-wrap: anywhere;
       }
     """
+    return css
 
     container_html = f'