From 53ab1e285d37340e660adff6a560f1b95463aa29 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 19:52:08 -0700 Subject: [PATCH 01/11] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a9c47a5a..aec1f1cf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.vscode cache characters training/datasets From 7a562481fa3eb73455c7aabdf24f19673e13fc18 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sat, 18 Mar 2023 23:42:10 -0700 Subject: [PATCH 02/11] Initial version of llamacpp_model.py --- modules/llamacpp_model.py | 94 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 modules/llamacpp_model.py diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py new file mode 100644 index 00000000..21415fa2 --- /dev/null +++ b/modules/llamacpp_model.py @@ -0,0 +1,94 @@ +import os +from pathlib import Path +import modules.shared as shared +from modules.callbacks import Iteratorize + +import llamacpp + + +class LlamaCppTokenizer: + """A thin wrapper over the llamacpp tokenizer""" + def __init__(self, model: llamacpp.PyLLAMA): + self._tokenizer = model.get_tokenizer() + self.eos_token_id = 2 + self.bos_token_id = 0 + + @classmethod + def from_model(cls, model: llamacpp.PyLLAMA): + return cls(model) + + def encode(self, prompt): + return self._tokenizer.tokenize(prompt) + + def decode(self, ids): + return self._tokenizer.detokenize(ids) + + +class LlamaCppModel: + def __init__(self): + self.initialized = False + + @classmethod + def from_pretrained(self, path): + params = llamacpp.gpt_params( + str(path), # model + 2048, # ctx_size + 200, # n_predict + 40, # top_k + 0.95, # top_p + 0.80, # temp + 1.30, # repeat_penalty + -1, # seed + 8, # threads + 64, # repeat_last_n + 8, # batch_size + ) + + _model = llamacpp.PyLLAMA(params) + + result = self() + result.model = _model + + tokenizer = LlamaCppTokenizer.from_model(_model) + return result, tokenizer + + # TODO: Allow passing in params for each inference + def generate(self, context="", num_tokens=10, callback=None): + # params = self.params + # params.n_predict = token_count + # params.top_p = top_p + # params.top_k = top_k + # params.temp = temperature + # params.repeat_penalty = repetition_penalty + # params.repeat_last_n = repeat_last_n + + # model.params = params + if not self.initialized: + self.model.add_bos() + + self.model.update_input(context) + if not self.initialized: + self.model.prepare_context() + self.initialized = True + + output = "" + is_end_of_text = False + ctr = 0 + while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text: + if self.model.has_unconsumed_input(): + self.model.ingest_all_pending_input(False) + else: + text, is_end_of_text = self.model.infer_text() + if callback: + callback(text) + output += text + ctr += 1 + + return output + + def generate_with_streaming(self, **kwargs): + with Iteratorize(self.generate, kwargs, callback=None) as generator: + reply = kwargs['context'] + for token in generator: + reply += token + yield reply From 7745faa7bb39c8f925d6b34d4a61c0a0778e13c0 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sat, 18 Mar 2023 23:42:28 -0700 Subject: [PATCH 03/11] Add llamacpp to models.py --- modules/models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index b19507db..c60af8e2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,9 +42,10 @@ def load_model(model_name): t0 = 
time.time() shared.is_RWKV = 'rwkv-' in model_name.lower() + shared.is_llamacpp = model_name.lower().startswith('llamacpp-') # Default settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -100,6 +101,12 @@ def load_model(model_name): model = load_quantized(model_name) + # LLAMACPP model + elif shared.is_llamacpp: + from modules.llamacpp_model import LlamaCppModel + model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin')) + return model, tokenizer + # Custom else: params = {"low_cpu_mem_usage": True} From a5f5736e748bad56ebd9c9c88d1cfa6f3fde97db Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 19:51:43 -0700 Subject: [PATCH 04/11] Add to text_generation.py --- modules/text_generation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 7b5fcd6a..e18a76d7 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -22,7 +22,7 @@ def get_max_prompt_length(tokens): return max_length def encode(prompt, tokens_to_generate=0, add_special_tokens=True): - if shared.is_RWKV: + if shared.is_RWKV or shared.is_llamacpp: input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) return input_ids @@ -142,6 +142,24 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi input_ids = encode(question) print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") return + elif shared.is_llamacpp: + try: + if shared.args.no_stream: + reply = shared.model.generate(context=question, num_tokens=max_new_tokens) + yield formatted_outputs(reply, shared.model_name) + else: + if not (shared.args.chat or shared.args.cai_chat): + yield formatted_outputs(question, shared.model_name) + for reply in shared.model.generate_with_streaming(context=question, num_tokens=max_new_tokens): + yield formatted_outputs(reply, shared.model_name) + except Exception as e: + print(e) + finally: + t1 = time.time() + output = encode(reply)[0] + input_ids = encode(question) + print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") + return input_ids = encode(question, max_new_tokens) original_input_ids = input_ids From 8953a262cb25a0dc3d5c486aba0e3f4175d83ffb Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 19:59:25 -0700 Subject: [PATCH 05/11] Add llamacpp to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 79da715d..e92c6889 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ safetensors==0.3.0 
sentencepiece tqdm datasets +llamacpp>=0.1.9 git+https://github.com/huggingface/transformers From 79fa2b6d7e338a61ed978fb1e5411838779e3761 Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Sun, 19 Mar 2023 21:30:24 -0700 Subject: [PATCH 06/11] Add support for alpaca --- modules/models.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/models.py b/modules/models.py index c60af8e2..e9fed4a9 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,7 +42,8 @@ def load_model(model_name): t0 = time.time() shared.is_RWKV = 'rwkv-' in model_name.lower() - shared.is_llamacpp = model_name.lower().startswith('llamacpp-') + shared.is_llamacpp = model_name.lower().startswith('llamacpp-') or \ + model_name.lower().startswith('alpaca-cpp-') # Default settings if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): @@ -104,7 +105,13 @@ def load_model(model_name): # LLAMACPP model elif shared.is_llamacpp: from modules.llamacpp_model import LlamaCppModel - model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin')) + + if model_name.lower().startswith('alpaca-'): + model_file = f'models/{model_name}/ggml-alpaca-7b-q4.bin' + else: + model_file = f'models/{model_name}/ggml-model-q4_0.bin' + + model, tokenizer = LlamaCppModel.from_pretrained(Path(model_file)) return model, tokenizer # Custom From 7fa5d96c220324c4b43dfe4dfdf1267137fc94cd Mon Sep 17 00:00:00 2001 From: Thomas Antony Date: Wed, 29 Mar 2023 21:20:22 +0100 Subject: [PATCH 07/11] Update to use new llamacpp API --- modules/llamacpp_model.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 21415fa2..f65ecb4e 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -8,16 +8,16 @@ import llamacpp class LlamaCppTokenizer: """A thin wrapper over the llamacpp tokenizer""" - def __init__(self, model: llamacpp.PyLLAMA): + def __init__(self, model: llamacpp.LlamaInference): self._tokenizer = model.get_tokenizer() self.eos_token_id = 2 self.bos_token_id = 0 @classmethod - def from_model(cls, model: llamacpp.PyLLAMA): + def from_model(cls, model: llamacpp.LlamaInference): return cls(model) - def encode(self, prompt): + def encode(self, prompt: str): return self._tokenizer.tokenize(prompt) def decode(self, ids): @@ -30,21 +30,10 @@ class LlamaCppModel: @classmethod def from_pretrained(self, path): - params = llamacpp.gpt_params( - str(path), # model - 2048, # ctx_size - 200, # n_predict - 40, # top_k - 0.95, # top_p - 0.80, # temp - 1.30, # repeat_penalty - -1, # seed - 8, # threads - 64, # repeat_last_n - 8, # batch_size - ) + params = llamacpp.InferenceParams() + params.path_model = str(path) - _model = llamacpp.PyLLAMA(params) + _model = llamacpp.LlamaInference(params) result = self() result.model = _model @@ -63,22 +52,20 @@ class LlamaCppModel: # params.repeat_last_n = repeat_last_n # model.params = params - if not self.initialized: - self.model.add_bos() - + self.model.add_bos() self.model.update_input(context) - if not self.initialized: - self.model.prepare_context() - self.initialized = True output = "" is_end_of_text = False ctr = 0 - while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text: + while ctr < 
num_tokens and not is_end_of_text: if self.model.has_unconsumed_input(): - self.model.ingest_all_pending_input(False) + self.model.ingest_all_pending_input() else: - text, is_end_of_text = self.model.infer_text() + self.model.eval() + token = self.model.sample() + text = self.model.token_to_str(token) + is_end_of_text = token == self.model.token_eos() if callback: callback(text) output += text From 9d1dcf880aa928524385d82baabc2ff262206f2e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 14:27:01 -0300 Subject: [PATCH 08/11] General improvements --- modules/llamacpp_model.py | 29 ++++++++++++++--------------- modules/text_generation.py | 22 ++-------------------- requirements.txt | 2 +- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index f65ecb4e..6b9b1b52 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -1,10 +1,10 @@ -import os from pathlib import Path -import modules.shared as shared -from modules.callbacks import Iteratorize import llamacpp +import modules.shared as shared +from modules.callbacks import Iteratorize + class LlamaCppTokenizer: """A thin wrapper over the llamacpp tokenizer""" @@ -37,19 +37,19 @@ class LlamaCppModel: result = self() result.model = _model + result.params = params tokenizer = LlamaCppTokenizer.from_model(_model) return result, tokenizer - # TODO: Allow passing in params for each inference - def generate(self, context="", num_tokens=10, callback=None): - # params = self.params - # params.n_predict = token_count - # params.top_p = top_p - # params.top_k = top_k - # params.temp = temperature - # params.repeat_penalty = repetition_penalty - # params.repeat_last_n = repeat_last_n + def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None): + params = self.params + params.n_predict = token_count + params.top_p = top_p + params.top_k = top_k + params.temp = temperature + params.repeat_penalty = repetition_penalty + #params.repeat_last_n = repeat_last_n # model.params = params self.model.add_bos() @@ -58,7 +58,7 @@ class LlamaCppModel: output = "" is_end_of_text = False ctr = 0 - while ctr < num_tokens and not is_end_of_text: + while ctr < token_count and not is_end_of_text: if self.model.has_unconsumed_input(): self.model.ingest_all_pending_input() else: @@ -68,14 +68,13 @@ class LlamaCppModel: is_end_of_text = token == self.model.token_eos() if callback: callback(text) - output += text ctr += 1 return output def generate_with_streaming(self, **kwargs): with Iteratorize(self.generate, kwargs, callback=None) as generator: - reply = kwargs['context'] + reply = '' for token in generator: reply += token yield reply diff --git a/modules/text_generation.py b/modules/text_generation.py index e18a76d7..8d54961e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -22,7 +22,7 @@ def get_max_prompt_length(tokens): return max_length def encode(prompt, tokens_to_generate=0, add_special_tokens=True): - if shared.is_RWKV or shared.is_llamacpp: + if any((shared.is_RWKV, shared.is_llamacpp)): input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) return input_ids @@ -116,7 +116,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi # These models are not part of Hugging Face, so we handle them # separately and terminate the function call earlier - if 
shared.is_RWKV: + if any((shared.is_RWKV, shared.is_llamacpp)): try: if shared.args.no_stream: reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k) @@ -142,24 +142,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi input_ids = encode(question) print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") return - elif shared.is_llamacpp: - try: - if shared.args.no_stream: - reply = shared.model.generate(context=question, num_tokens=max_new_tokens) - yield formatted_outputs(reply, shared.model_name) - else: - if not (shared.args.chat or shared.args.cai_chat): - yield formatted_outputs(question, shared.model_name) - for reply in shared.model.generate_with_streaming(context=question, num_tokens=max_new_tokens): - yield formatted_outputs(reply, shared.model_name) - except Exception as e: - print(e) - finally: - t1 = time.time() - output = encode(reply)[0] - input_ids = encode(question) - print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") - return input_ids = encode(question, max_new_tokens) original_input_ids = input_ids diff --git a/requirements.txt b/requirements.txt index e92c6889..08ee5d58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ accelerate==0.18.0 bitsandbytes==0.37.2 flexgen==0.1.7 gradio==3.23.0 +llamacpp==0.1.10 markdown numpy peft==0.2.0 @@ -11,5 +12,4 @@ safetensors==0.3.0 sentencepiece tqdm datasets -llamacpp>=0.1.9 git+https://github.com/huggingface/transformers From 4c275621572bc6719ebc30f715184bb4d5477e38 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 14:33:46 -0300 Subject: [PATCH 09/11] Minor changes --- .gitignore | 2 +- modules/models.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index aec1f1cf..bfb6d027 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -.vscode cache characters training/datasets @@ -15,6 +14,7 @@ torch-dumps */*/pycache* venv/ .venv/ +.vscode repositories settings.json diff --git a/modules/models.py b/modules/models.py index e9fed4a9..80bbcab2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,8 +42,7 @@ def load_model(model_name): t0 = time.time() shared.is_RWKV = 'rwkv-' in model_name.lower() - shared.is_llamacpp = model_name.lower().startswith('llamacpp-') or \ - model_name.lower().startswith('alpaca-cpp-') + shared.is_llamacpp = model_name.lower().startswith(('llamacpp', 'alpaca-cpp')) # Default settings if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): @@ -102,11 +101,11 @@ def load_model(model_name): model = load_quantized(model_name) - # LLAMACPP model + # llamacpp model elif shared.is_llamacpp: from modules.llamacpp_model import LlamaCppModel - if model_name.lower().startswith('alpaca-'): + if model_name.lower().startswith('alpaca-cpp'): model_file = f'models/{model_name}/ggml-alpaca-7b-q4.bin' else: model_file = f'models/{model_name}/ggml-model-q4_0.bin' From 09b0a3aafb1a3b2d86912db0114b84ad3bc6029a Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 14:45:17 -0300 Subject: [PATCH 10/11] Add repetition_penalty --- modules/RWKV.py | 2 +- modules/text_generation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 8c7ea2b9..10c4c366 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -34,7 +34,7 @@ class RWKVModel: result.pipeline = pipeline return result - def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None): + def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None): args = PIPELINE_ARGS( temperature = temperature, top_p = top_p, diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d54961e..b8b2f496 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -119,7 +119,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if any((shared.is_RWKV, shared.is_llamacpp)): try: if shared.args.no_stream: - reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k) + reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty) if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply, "output") yield formatted_outputs(reply, shared.model_name) From a5c9b7d97763acda0ebc4db84ec6a2adfc093106 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 31 Mar 2023 15:08:01 -0300 Subject: [PATCH 11/11] Bump llamacpp version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 29ede4c6..ffa6b51a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ accelerate==0.18.0 bitsandbytes==0.37.2 flexgen==0.1.7 gradio==3.24.0 -llamacpp==0.1.10 +llamacpp==0.1.11 markdown numpy peft==0.2.0
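
Taken together, these patches route any model whose directory name starts with 'llamacpp' or 'alpaca-cpp' through the new LlamaCppModel wrapper instead of the Hugging Face loaders. The sketch below is only an illustration of how that wrapper is meant to be driven once the series is applied; the model directory, weights filename, prompt, and sampling values are placeholders, not anything mandated by the patches.

    from pathlib import Path

    from modules.llamacpp_model import LlamaCppModel

    # Placeholder path: any 4-bit GGML weights file converted with llama.cpp
    # and placed under models/ is assumed to work the same way.
    model, tokenizer = LlamaCppModel.from_pretrained(
        Path('models/llamacpp-7b/ggml-model-q4_0.bin'))

    prompt = 'Building a website can be done in 10 simple steps:'

    # Streaming use, mirroring modules/text_generation.py: each yielded value
    # is the reply accumulated so far, so the last one is the full completion.
    for partial_reply in model.generate_with_streaming(
            context=prompt, token_count=64, temperature=0.8,
            top_p=0.95, top_k=40, repetition_penalty=1.3):
        print(partial_reply)

    # Callback-driven use: generate() hands each decoded token to the callback
    # as it is sampled (after the "General improvements" patch the return value
    # no longer accumulates the text, so the callback is what carries output).
    model.generate(context=prompt, token_count=64, temperature=0.8, top_p=0.95,
                   top_k=40, repetition_penalty=1.3,
                   callback=lambda text: print(text, end='', flush=True))

The tokenizer returned alongside the model exposes encode()/decode(), which is what modules/text_generation.py relies on when it counts prompt and output tokens for the models it handles outside of transformers.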