From 53ab1e285d37340e660adff6a560f1b95463aa29 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 19:52:08 -0700 Subject: [PATCH 01/11] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a9c47a5a..aec1f1cf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.vscode cache characters training/datasets From 7a562481fa3eb73455c7aabdf24f19673e13fc18 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sat, 18 Mar 2023 23:42:10 -0700 Subject: [PATCH 02/11] Initial version of llamacpp_model.py --- modules/llamacpp_model.py | 94 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 modules/llamacpp_model.py diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py new file mode 100644 index 00000000..21415fa2 --- /dev/null +++ b/modules/llamacpp_model.py @@ -0,0 +1,94 @@ +import os +from pathlib import Path +import modules.shared as shared +from modules.callbacks import Iteratorize + +import llamacpp + + +class LlamaCppTokenizer: + """A thin wrapper over the llamacpp tokenizer""" + def __init__(self, model: llamacpp.PyLLAMA): + self._tokenizer = model.get_tokenizer() + self.eos_token_id = 2 + self.bos_token_id = 0 + + @classmethod + def from_model(cls, model: llamacpp.PyLLAMA): + return cls(model) + + def encode(self, prompt): + return self._tokenizer.tokenize(prompt) + + def decode(self, ids): + return self._tokenizer.detokenize(ids) + + +class LlamaCppModel: + def __init__(self): + self.initialized = False + + @classmethod + def from_pretrained(self, path): + params = llamacpp.gpt_params( + str(path), # model + 2048, # ctx_size + 200, # n_predict + 40, # top_k + 0.95, # top_p + 0.80, # temp + 1.30, # repeat_penalty + -1, # seed + 8, # threads + 64, # repeat_last_n + 8, # batch_size + ) + + _model = llamacpp.PyLLAMA(params) + + result = self() + result.model = _model + + tokenizer = LlamaCppTokenizer.from_model(_model) + return result, tokenizer + + # TODO: Allow passing in params for each inference + def generate(self, context="", num_tokens=10, callback=None): + # params = self.params + # params.n_predict = token_count + # params.top_p = top_p + # params.top_k = top_k + # params.temp = temperature + # params.repeat_penalty = repetition_penalty + # params.repeat_last_n = repeat_last_n + + # model.params = params + if not self.initialized: + self.model.add_bos() + + self.model.update_input(context) + if not self.initialized: + self.model.prepare_context() + self.initialized = True + + output = "" + is_end_of_text = False + ctr = 0 + while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text: + if self.model.has_unconsumed_input(): + self.model.ingest_all_pending_input(False) + else: + text, is_end_of_text = self.model.infer_text() + if callback: + callback(text) + output += text + ctr += 1 + + return output + + def generate_with_streaming(self, **kwargs): + with Iteratorize(self.generate, kwargs, callback=None) as generator: + reply = kwargs['context'] + for token in generator: + reply += token + yield reply From 7745faa7bb39c8f925d6b34d4a61c0a0778e13c0 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sat, 18 Mar 2023 23:42:28 -0700 Subject: [PATCH 03/11] Add llamacpp to models.py --- modules/models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index b19507db..c60af8e2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,9 +42,10 @@ def load_model(model_name): t0 = 
time.time() shared.is_RWKV = 'rwkv-' in model_name.lower() + shared.is_llamacpp = model_name.lower().startswith('llamacpp-') # Default settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -100,6 +101,12 @@ def load_model(model_name): model = load_quantized(model_name) + # LLAMACPP model + elif shared.is_llamacpp: + from modules.llamacpp_model import LlamaCppModel + model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin')) + return model, tokenizer + # Custom else: params = {"low_cpu_mem_usage": True} From a5f5736e748bad56ebd9c9c88d1cfa6f3fde97db Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 19:51:43 -0700 Subject: [PATCH 04/11] Add to text_generation.py --- modules/text_generation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 7b5fcd6a..e18a76d7 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -22,7 +22,7 @@ def get_max_prompt_length(tokens): return max_length def encode(prompt, tokens_to_generate=0, add_special_tokens=True): - if shared.is_RWKV: + if shared.is_RWKV or shared.is_llamacpp: input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) return input_ids @@ -142,6 +142,24 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi input_ids = encode(question) print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") return + elif shared.is_llamacpp: + try: + if shared.args.no_stream: + reply = shared.model.generate(context=question, num_tokens=max_new_tokens) + yield formatted_outputs(reply, shared.model_name) + else: + if not (shared.args.chat or shared.args.cai_chat): + yield formatted_outputs(question, shared.model_name) + for reply in shared.model.generate_with_streaming(context=question, num_tokens=max_new_tokens): + yield formatted_outputs(reply, shared.model_name) + except Exception as e: + print(e) + finally: + t1 = time.time() + output = encode(reply)[0] + input_ids = encode(question) + print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") + return input_ids = encode(question, max_new_tokens) original_input_ids = input_ids From 8953a262cb25a0dc3d5c486aba0e3f4175d83ffb Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 19:59:25 -0700 Subject: [PATCH 05/11] Add llamacpp to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 79da715d..e92c6889 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ safetensors==0.3.0 
sentencepiece tqdm datasets +llamacpp>=0.1.9 git+https://github.com/huggingface/transformers From 79fa2b6d7e338a61ed978fb1e5411838779e3761 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 21:30:24 -0700 Subject: [PATCH 06/11] Add support for alpaca --- modules/models.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/models.py b/modules/models.py index c60af8e2..e9fed4a9 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,7 +42,8 @@ def load_model(model_name): t0 = time.time() shared.is_RWKV = 'rwkv-' in model_name.lower() - shared.is_llamacpp = model_name.lower().startswith('llamacpp-') + shared.is_llamacpp = model_name.lower().startswith('llamacpp-') or \ + model_name.lower().startswith('alpaca-cpp-') # Default settings if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): @@ -104,7 +105,13 @@ def load_model(model_name): # LLAMACPP model elif shared.is_llamacpp: from modules.llamacpp_model import LlamaCppModel - model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin')) + + if model_name.lower().startswith('alpaca-'): + model_file = f'models/{model_name}/ggml-alpaca-7b-q4.bin' + else: + model_file = f'models/{model_name}/ggml-model-q4_0.bin' + + model, tokenizer = LlamaCppModel.from_pretrained(Path(model_file)) return model, tokenizer # Custom From 7fa5d96c220324c4b43dfe4dfdf1267137fc94cd Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Wed, 29 Mar 2023 21:20:22 +0100 Subject: [PATCH 07/11] Update to use new llamacpp API --- modules/llamacpp_model.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 21415fa2..f65ecb4e 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -8,16 +8,16 @@ import llamacpp class LlamaCppTokenizer: """A thin wrapper over the llamacpp tokenizer""" - def __init__(self, model: llamacpp.PyLLAMA): + def __init__(self, model: llamacpp.LlamaInference): self._tokenizer = model.get_tokenizer() self.eos_token_id = 2 self.bos_token_id = 0 @classmethod - def from_model(cls, model: llamacpp.PyLLAMA): + def from_model(cls, model: llamacpp.LlamaInference): return cls(model) - def encode(self, prompt): + def encode(self, prompt: str): return self._tokenizer.tokenize(prompt) def decode(self, ids): @@ -30,21 +30,10 @@ class LlamaCppModel: @classmethod def from_pretrained(self, path): - params = llamacpp.gpt_params( - str(path), # model - 2048, # ctx_size - 200, # n_predict - 40, # top_k - 0.95, # top_p - 0.80, # temp - 1.30, # repeat_penalty - -1, # seed - 8, # threads - 64, # repeat_last_n - 8, # batch_size - ) + params = llamacpp.InferenceParams() + params.path_model = str(path) - _model = llamacpp.PyLLAMA(params) + _model = llamacpp.LlamaInference(params) result = self() result.model = _model @@ -63,22 +52,20 @@ class LlamaCppModel: # params.repeat_last_n = repeat_last_n # model.params = params - if not self.initialized: - self.model.add_bos() - + self.model.add_bos() self.model.update_input(context) - if not self.initialized: - self.model.prepare_context() - self.initialized = True output = "" is_end_of_text = False ctr = 0 - while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text: + while ctr < 
num_tokens and not is_end_of_text: if self.model.has_unconsumed_input(): - self.model.ingest_all_pending_input(False) + self.model.ingest_all_pending_input() else: - text, is_end_of_text = self.model.infer_text() + self.model.eval() + token = self.model.sample() + text = self.model.token_to_str(token) + is_end_of_text = token == self.model.token_eos() if callback: callback(text) output += text From 9d1dcf880aa928524385d82baabc2ff262206f2e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 14:27:01 -0300 Subject: [PATCH 08/11] General improvements --- modules/llamacpp_model.py | 29 ++++++++++++++--------------- modules/text_generation.py | 22 ++-------------------- requirements.txt | 2 +- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index f65ecb4e..6b9b1b52 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -1,10 +1,10 @@ -import os from pathlib import Path -import modules.shared as shared -from modules.callbacks import Iteratorize import llamacpp +import modules.shared as shared +from modules.callbacks import Iteratorize + class LlamaCppTokenizer: """A thin wrapper over the llamacpp tokenizer""" @@ -37,19 +37,19 @@ class LlamaCppModel: result = self() result.model = _model + result.params = params tokenizer = LlamaCppTokenizer.from_model(_model) return result, tokenizer - # TODO: Allow passing in params for each inference - def generate(self, context="", num_tokens=10, callback=None): - # params = self.params - # params.n_predict = token_count - # params.top_p = top_p - # params.top_k = top_k - # params.temp = temperature - # params.repeat_penalty = repetition_penalty - # params.repeat_last_n = repeat_last_n + def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None): + params = self.params + params.n_predict = token_count + params.top_p = top_p + params.top_k = top_k + params.temp = temperature + params.repeat_penalty = repetition_penalty + #params.repeat_last_n = repeat_last_n # model.params = params self.model.add_bos() @@ -58,7 +58,7 @@ class LlamaCppModel: output = "" is_end_of_text = False ctr = 0 - while ctr < num_tokens and not is_end_of_text: + while ctr < token_count and not is_end_of_text: if self.model.has_unconsumed_input(): self.model.ingest_all_pending_input() else: @@ -68,14 +68,13 @@ class LlamaCppModel: is_end_of_text = token == self.model.token_eos() if callback: callback(text) - output += text ctr += 1 return output def generate_with_streaming(self, **kwargs): with Iteratorize(self.generate, kwargs, callback=None) as generator: - reply = kwargs['context'] + reply = '' for token in generator: reply += token yield reply diff --git a/modules/text_generation.py b/modules/text_generation.py index e18a76d7..8d54961e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -22,7 +22,7 @@ def get_max_prompt_length(tokens): return max_length def encode(prompt, tokens_to_generate=0, add_special_tokens=True): - if shared.is_RWKV or shared.is_llamacpp: + if any((shared.is_RWKV, shared.is_llamacpp)): input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) return input_ids @@ -116,7 +116,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi # These models are not part of Hugging Face, so we handle them # separately and terminate the function call earlier - if 
shared.is_RWKV: + if any((shared.is_RWKV, shared.is_llamacpp)): try: if shared.args.no_stream: reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k) @@ -142,24 +142,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi input_ids = encode(question) print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") return - elif shared.is_llamacpp: - try: - if shared.args.no_stream: - reply = shared.model.generate(context=question, num_tokens=max_new_tokens) - yield formatted_outputs(reply, shared.model_name) - else: - if not (shared.args.chat or shared.args.cai_chat): - yield formatted_outputs(question, shared.model_name) - for reply in shared.model.generate_with_streaming(context=question, num_tokens=max_new_tokens): - yield formatted_outputs(reply, shared.model_name) - except Exception as e: - print(e) - finally: - t1 = time.time() - output = encode(reply)[0] - input_ids = encode(question) - print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") - return input_ids = encode(question, max_new_tokens) original_input_ids = input_ids diff --git a/requirements.txt b/requirements.txt index e92c6889..08ee5d58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ accelerate==0.18.0 bitsandbytes==0.37.2 flexgen==0.1.7 gradio==3.23.0 +llamacpp==0.1.10 markdown numpy peft==0.2.0 @@ -11,5 +12,4 @@ safetensors==0.3.0 sentencepiece tqdm datasets -llamacpp>=0.1.9 git+https://github.com/huggingface/transformers From 4c275621572bc6719ebc30f715184bb4d5477e38 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 14:33:46 -0300 Subject: [PATCH 09/11] Minor changes --- .gitignore | 2 +- modules/models.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index aec1f1cf..bfb6d027 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -.vscode cache characters training/datasets @@ -15,6 +14,7 @@ torch-dumps */*/pycache* venv/ .venv/ +.vscode repositories settings.json diff --git a/modules/models.py b/modules/models.py index e9fed4a9..80bbcab2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,8 +42,7 @@ def load_model(model_name): t0 = time.time() shared.is_RWKV = 'rwkv-' in model_name.lower() - shared.is_llamacpp = model_name.lower().startswith('llamacpp-') or \ - model_name.lower().startswith('alpaca-cpp-') + shared.is_llamacpp = model_name.lower().startswith(('llamacpp', 'alpaca-cpp')) # Default settings if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): @@ -102,11 +101,11 @@ def load_model(model_name): model = load_quantized(model_name) - # LLAMACPP model + # llamacpp model elif shared.is_llamacpp: from modules.llamacpp_model import LlamaCppModel - if model_name.lower().startswith('alpaca-'): + if model_name.lower().startswith('alpaca-cpp'): model_file = f'models/{model_name}/ggml-alpaca-7b-q4.bin' else: model_file = f'models/{model_name}/ggml-model-q4_0.bin' From 09b0a3aafb1a3b2d86912db0114b84ad3bc6029a Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 14:45:17 -0300 Subject: [PATCH 10/11] Add repetition_penalty --- modules/RWKV.py | 2 +- modules/text_generation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 8c7ea2b9..10c4c366 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -34,7 +34,7 @@ class RWKVModel: result.pipeline = pipeline return result - def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None): + def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None): args = PIPELINE_ARGS( temperature = temperature, top_p = top_p, diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d54961e..b8b2f496 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -119,7 +119,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if any((shared.is_RWKV, shared.is_llamacpp)): try: if shared.args.no_stream: - reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k) + reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty) if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply, "output") yield formatted_outputs(reply, shared.model_name) From a5c9b7d97763acda0ebc4db84ec6a2adfc093106 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 15:08:01 -0300 Subject: [PATCH 11/11] Bump llamacpp version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 29ede4c6..ffa6b51a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ accelerate==0.18.0 bitsandbytes==0.37.2 flexgen==0.1.7 gradio==3.24.0 -llamacpp==0.1.10 +llamacpp==0.1.11 markdown numpy peft==0.2.0
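
Taken together, these patches route any model whose directory name starts with 'llamacpp' or 'alpaca-cpp' through the new LlamaCppModel wrapper instead of the Hugging Face loaders. The sketch below is only an illustration of how that wrapper is meant to be driven once the series is applied; the model directory, weights filename, prompt, and sampling values are placeholders, not anything mandated by the patches.

    from pathlib import Path

    from modules.llamacpp_model import LlamaCppModel

    # Placeholder path: any 4-bit GGML weights file converted with llama.cpp
    # and placed under models/ is assumed to work the same way.
    model, tokenizer = LlamaCppModel.from_pretrained(
        Path('models/llamacpp-7b/ggml-model-q4_0.bin'))

    prompt = 'Building a website can be done in 10 simple steps:'

    # Streaming use, mirroring modules/text_generation.py: each yielded value
    # is the reply accumulated so far, so the last one is the full completion.
    for partial_reply in model.generate_with_streaming(
            context=prompt, token_count=64, temperature=0.8,
            top_p=0.95, top_k=40, repetition_penalty=1.3):
        print(partial_reply)

    # Callback-driven use: generate() hands each decoded token to the callback
    # as it is sampled (after the "General improvements" patch the return value
    # no longer accumulates the text, so the callback is what carries output).
    model.generate(context=prompt, token_count=64, temperature=0.8, top_p=0.95,
                   top_k=40, repetition_penalty=1.3,
                   callback=lambda text: print(text, end='', flush=True))

The tokenizer returned alongside the model exposes encode()/decode(), which is what modules/text_generation.py relies on when it counts prompt and output tokens for the models it handles outside of transformers.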