From e17af5926156c1f9f9d793b29bc360bece97a5a0 Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 00:21:34 -0700
Subject: [PATCH 01/42] Add support for resuming downloads

This commit adds the ability to resume interrupted downloads by adding
a new function to the downloader module. The function uses the HTTP
Range header to fetch only the remaining part of a file that wasn't
downloaded yet.
---
 download-model.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/download-model.py b/download-model.py
index 7e5f61b2..94524f76 100644
--- a/download-model.py
+++ b/download-model.py
@@ -27,8 +27,23 @@ parser.add_argument('--output', type=str, default=None, help='The folder where t
 args = parser.parse_args()
 
 def get_file(url, output_folder):
-    r = requests.get(url, stream=True)
-    with open(output_folder / Path(url.rsplit('/', 1)[1]), 'wb') as f:
+    filename = Path(url.rsplit('/', 1)[1])
+    output_path = output_folder / filename
+    if output_path.exists():
+        # Check if the file has already been downloaded completely
+        r = requests.head(url)
+        total_size = int(r.headers.get('content-length', 0))
+        if output_path.stat().st_size == total_size:
+            return
+        # Otherwise, resume the download from where it left off
+        headers = {'Range': f'bytes={output_path.stat().st_size}-'}
+        mode = 'ab'
+    else:
+        headers = {}
+        mode = 'wb'
+
+    r = requests.get(url, stream=True, headers=headers)
+    with open(output_path, mode) as f:
         total_size = int(r.headers.get('content-length', 0))
         block_size = 1024
         with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
@@ -149,7 +164,7 @@ def get_download_links_from_huggingface(model, branch):
     return links, sha256, is_lora
 
 def download_files(file_list, output_folder, num_threads=8):
-    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads)
+    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads, disable=True)
 
 if __name__ == '__main__':
     model = args.MODEL

From 8c590c2362d4ee783b41a93f6c03f8e19dc40657 Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 00:42:19 -0700
Subject: [PATCH 02/42] Added a 'clean' flag to not resume download.

---
 download-model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index 94524f76..ed8a047d 100644
--- a/download-model.py
+++ b/download-model.py
@@ -24,12 +24,13 @@ parser.add_argument('--branch', type=str, default='main', help='Name of the Git
 parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
 parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
 parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
 args = parser.parse_args()
 
 def get_file(url, output_folder):
     filename = Path(url.rsplit('/', 1)[1])
     output_path = output_folder / filename
-    if output_path.exists():
+    if output_path.exists() and not args.clean:
         # Check if the file has already been downloaded completely
         r = requests.head(url)
         total_size = int(r.headers.get('content-length', 0))

From 297ac051d91c52ee6e0a39bc745d84f85a305346 Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 02:34:19 -0700
Subject: [PATCH 03/42] Added sha256 validation of model files.

---
 download-model.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index ed8a047d..0b3f16d7 100644
--- a/download-model.py
+++ b/download-model.py
@@ -17,6 +17,7 @@ from pathlib import Path
 import requests
 import tqdm
 from tqdm.contrib.concurrent import thread_map
+import hashlib
 
 parser = argparse.ArgumentParser()
 parser.add_argument('MODEL', type=str, default=None, nargs='?')
@@ -213,4 +214,17 @@ if __name__ == '__main__':
     # Downloading the files
     print(f"Downloading the model to {output_folder}")
     download_files(links, output_folder, args.threads)
-    print()
+
+    print('\n')
+    # Validate the checksums
+    validated = True
+    for i in range(len(sha256)):
+        with open(output_folder / sha256[i][0], "rb") as f:
+            bytes = f.read()
+            file_hash = hashlib.sha256(bytes).hexdigest()
+            if file_hash != sha256[i][1]:
+                print(f'[!] Checksum for {sha256[i][0]} failed!')
+                validated = False
+
+    if validated:
+        print('[+] Validated checksums of all model files!')
\ No newline at end of file

From 53ab1e285d37340e660adff6a560f1b95463aa29 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 19:52:08 -0700
Subject: [PATCH 04/42] Update .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a9c47a5a..aec1f1cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.vscode
 cache
 characters
 training/datasets

From 7a562481fa3eb73455c7aabdf24f19673e13fc18 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sat, 18 Mar 2023 23:42:10 -0700
Subject: [PATCH 05/42] Initial version of llamacpp_model.py

---
 modules/llamacpp_model.py | 94 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 modules/llamacpp_model.py

diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
new file mode 100644
index 00000000..21415fa2
--- /dev/null
+++ b/modules/llamacpp_model.py
@@ -0,0 +1,94 @@
+import os
+from pathlib import Path
+import modules.shared as shared
+from modules.callbacks import Iteratorize
+
+import llamacpp
+
+
+class LlamaCppTokenizer:
+    """A thin wrapper over the llamacpp tokenizer"""
+    def __init__(self, model: llamacpp.PyLLAMA):
+        self._tokenizer = model.get_tokenizer()
+        self.eos_token_id = 2
+        self.bos_token_id = 0
+
+    @classmethod
+    def from_model(cls, model: llamacpp.PyLLAMA):
+        return cls(model)
+
+    def encode(self, prompt):
+        return self._tokenizer.tokenize(prompt)
+
+    def decode(self, ids):
+        return self._tokenizer.detokenize(ids)
+
+
+class LlamaCppModel:
+    def __init__(self):
+        self.initialized = False
+
+    @classmethod
+    def from_pretrained(self, path):
+        params = llamacpp.gpt_params(
+            str(path),  # model
+            2048,  # ctx_size
+            200,  # n_predict
+            40,  # top_k
+            0.95,  # top_p
+            0.80,  # temp
+            1.30,  # repeat_penalty
+            -1,  # seed
+            8,  # threads
+            64,  # repeat_last_n
+            8,  # batch_size
+        )
+
+        _model = llamacpp.PyLLAMA(params)
+
+        result = self()
+        result.model = _model
+
+        tokenizer = LlamaCppTokenizer.from_model(_model)
+        return result, tokenizer
+
+    # TODO: Allow passing in params for each inference
+    def generate(self, context="", num_tokens=10, callback=None):
+        # params = self.params
+        # params.n_predict = token_count
+        # params.top_p = top_p
+        # params.top_k = top_k
+        # params.temp = temperature
+        # params.repeat_penalty = repetition_penalty
+        # params.repeat_last_n = repeat_last_n
+
+        # model.params = params
+        if not self.initialized:
+            self.model.add_bos()
+
+        self.model.update_input(context)
+        if not self.initialized:
+            self.model.prepare_context()
+            self.initialized = True
+
+        output = ""
+        is_end_of_text = False
+        ctr = 0
+        while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text:
+            if self.model.has_unconsumed_input():
+                self.model.ingest_all_pending_input(False)
+            else:
+                text, is_end_of_text = self.model.infer_text()
+                if callback:
+                    callback(text)
+                output += text
+                ctr += 1
+
+        return output
+
+    def generate_with_streaming(self, **kwargs):
+        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+            reply = kwargs['context']
+            for token in generator:
+                reply += token
+                yield reply

From 7745faa7bb39c8f925d6b34d4a61c0a0778e13c0 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sat, 18 Mar 2023 23:42:28 -0700
Subject: [PATCH 06/42] Add llamacpp to models.py

---
 modules/models.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modules/models.py b/modules/models.py
index b19507db..c60af8e2 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -42,9 +42,10 @@ def load_model(model_name):
     t0 = time.time()
 
     shared.is_RWKV = 'rwkv-' in model_name.lower()
+    shared.is_llamacpp = model_name.lower().startswith('llamacpp-')
 
     # Default settings
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
             model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
@@ -100,6 +101,12 @@ def load_model(model_name):
 
         model = load_quantized(model_name)
 
+    # LLAMACPP model
+    elif shared.is_llamacpp:
+        from modules.llamacpp_model import LlamaCppModel
+        model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin'))
+        return model, tokenizer
+
     # Custom
     else:
         params = {"low_cpu_mem_usage": True}

From a5f5736e748bad56ebd9c9c88d1cfa6f3fde97db Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 19:51:43 -0700
Subject: [PATCH 07/42] Add to text_generation.py

---
 modules/text_generation.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/text_generation.py b/modules/text_generation.py
index 7b5fcd6a..e18a76d7 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -22,7 +22,7 @@ def get_max_prompt_length(tokens):
     return max_length
 
 def encode(prompt, tokens_to_generate=0, add_special_tokens=True):
-    if shared.is_RWKV:
+    if shared.is_RWKV or shared.is_llamacpp:
         input_ids = shared.tokenizer.encode(str(prompt))
         input_ids = np.array(input_ids).reshape(1, len(input_ids))
         return input_ids
@@ -142,6 +142,24 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             input_ids = encode(question)
             print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
         return
+    elif shared.is_llamacpp:
+        try:
+            if shared.args.no_stream:
+                reply = shared.model.generate(context=question, num_tokens=max_new_tokens)
+                yield formatted_outputs(reply, shared.model_name)
+            else:
+                if not (shared.args.chat or shared.args.cai_chat):
+                    yield formatted_outputs(question, shared.model_name)
+                for reply in shared.model.generate_with_streaming(context=question, num_tokens=max_new_tokens):
+                    yield formatted_outputs(reply, shared.model_name)
+        except Exception as e:
+            print(e)
+        finally:
+            t1 = time.time()
+            output = encode(reply)[0]
+            input_ids = encode(question)
+            print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
+        return
 
     input_ids = encode(question, max_new_tokens)
     original_input_ids = input_ids

From 8953a262cb25a0dc3d5c486aba0e3f4175d83ffb Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 19:59:25 -0700
Subject: [PATCH 08/42] Add llamacpp to requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 79da715d..e92c6889 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ safetensors==0.3.0
 sentencepiece
 tqdm
 datasets
+llamacpp>=0.1.9
 git+https://github.com/huggingface/transformers

From 79fa2b6d7e338a61ed978fb1e5411838779e3761 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 21:30:24 -0700
Subject: [PATCH 09/42] Add support for alpaca

---
 modules/models.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index c60af8e2..e9fed4a9 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -42,7 +42,8 @@ def load_model(model_name):
     t0 = time.time()
 
     shared.is_RWKV = 'rwkv-' in model_name.lower()
-    shared.is_llamacpp = model_name.lower().startswith('llamacpp-')
+    shared.is_llamacpp = model_name.lower().startswith('llamacpp-') or \
+                         model_name.lower().startswith('alpaca-cpp-')
 
     # Default settings
     if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]):
@@ -104,7 +105,13 @@ def load_model(model_name):
     # LLAMACPP model
     elif shared.is_llamacpp:
         from modules.llamacpp_model import LlamaCppModel
-        model, tokenizer = LlamaCppModel.from_pretrained(Path(f'models/{model_name}/ggml-model-q4_0.bin'))
+
+        if model_name.lower().startswith('alpaca-'):
+            model_file = f'models/{model_name}/ggml-alpaca-7b-q4.bin'
+        else:
+            model_file = f'models/{model_name}/ggml-model-q4_0.bin'
+
+        model, tokenizer = LlamaCppModel.from_pretrained(Path(model_file))
         return model, tokenizer
 
     # Custom

From 7fa5d96c220324c4b43dfe4dfdf1267137fc94cd Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Wed, 29 Mar 2023 21:20:22 +0100
Subject: [PATCH 10/42] Update to use new llamacpp API

---
 modules/llamacpp_model.py | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 21415fa2..f65ecb4e 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -8,16 +8,16 @@ import llamacpp
 
 class LlamaCppTokenizer:
     """A thin wrapper over the llamacpp tokenizer"""
-    def __init__(self, model: llamacpp.PyLLAMA):
+    def __init__(self, model: llamacpp.LlamaInference):
         self._tokenizer = model.get_tokenizer()
         self.eos_token_id = 2
         self.bos_token_id = 0
 
     @classmethod
-    def from_model(cls, model: llamacpp.PyLLAMA):
+    def from_model(cls, model: llamacpp.LlamaInference):
         return cls(model)
 
-    def encode(self, prompt):
+    def encode(self, prompt: str):
         return self._tokenizer.tokenize(prompt)
 
     def decode(self, ids):
@@ -30,21 +30,10 @@ class LlamaCppModel:
 
     @classmethod
     def from_pretrained(self, path):
-        params = llamacpp.gpt_params(
-            str(path),  # model
-            2048,  # ctx_size
-            200,  # n_predict
-            40,  # top_k
-            0.95,  # top_p
-            0.80,  # temp
-            1.30,  # repeat_penalty
-            -1,  # seed
-            8,  # threads
-            64,  # repeat_last_n
-            8,  # batch_size
-        )
+        params = llamacpp.InferenceParams()
+        params.path_model = str(path)
 
-        _model = llamacpp.PyLLAMA(params)
+        _model = llamacpp.LlamaInference(params)
 
         result = self()
         result.model = _model
@@ -63,22 +52,20 @@ class LlamaCppModel:
         # params.repeat_last_n = repeat_last_n
 
         # model.params = params
-        if not self.initialized:
-            self.model.add_bos()
-
+        self.model.add_bos()
         self.model.update_input(context)
-        if not self.initialized:
-            self.model.prepare_context()
-            self.initialized = True
 
         output = ""
         is_end_of_text = False
         ctr = 0
-        while not self.model.is_finished() and ctr < num_tokens and not is_end_of_text:
+        while ctr < num_tokens and not is_end_of_text:
             if self.model.has_unconsumed_input():
-                self.model.ingest_all_pending_input(False)
+                self.model.ingest_all_pending_input()
             else:
-                text, is_end_of_text = self.model.infer_text()
+                self.model.eval()
+                token = self.model.sample()
+                text = self.model.token_to_str(token)
+                is_end_of_text = token == self.model.token_eos()
                 if callback:
                     callback(text)
                 output += text

From d550c12a3eda0aeceacc8013f9f77808d9f524be Mon Sep 17 00:00:00 2001
From: Nikita Skakun
Date: Thu, 30 Mar 2023 12:52:16 -0700
Subject: [PATCH 11/42] Fixed the bug with additional bytes.

The issue seems to be with huggingface not reporting the entire size of
the model.
Added an error message with instructions if the checksums don't match.
---
 download-model.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/download-model.py b/download-model.py
index 0b3f16d7..52cdae81 100644
--- a/download-model.py
+++ b/download-model.py
@@ -35,7 +35,7 @@ def get_file(url, output_folder):
         # Check if the file has already been downloaded completely
         r = requests.head(url)
         total_size = int(r.headers.get('content-length', 0))
-        if output_path.stat().st_size == total_size:
+        if output_path.stat().st_size >= total_size:
             return
         # Otherwise, resume the download from where it left off
         headers = {'Range': f'bytes={output_path.stat().st_size}-'}
@@ -215,7 +215,6 @@ if __name__ == '__main__':
     print(f"Downloading the model to {output_folder}")
     download_files(links, output_folder, args.threads)
 
-    print('\n')
     # Validate the checksums
     validated = True
     for i in range(len(sha256)):
@@ -227,4 +226,6 @@ if __name__ == '__main__':
             validated = False
 
     if validated:
-        print('[+] Validated checksums of all model files!')
\ No newline at end of file
+        print('[+] Validated checksums of all model files!')
+    else:
+        print('[-] Rerun the download-model.py with --clean flag')
\ No newline at end of file

From bb69e054a7241798a8bc0c784747653aa753daab Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 30 Mar 2023 21:08:50 -0300
Subject: [PATCH 12/42] Add dummy file

---
 loras/place-your-loras-here.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 loras/place-your-loras-here.txt

diff --git a/loras/place-your-loras-here.txt b/loras/place-your-loras-here.txt
new file mode 100644
index 00000000..e69de29b

From f9940b79dc9f67477cec3819dbfb250f5e92a658 Mon Sep 17 00:00:00 2001
From: ye7iaserag
Date: Fri, 31 Mar 2023 04:56:49 +0200
Subject: [PATCH 13/42] Implement character gallery using Dataset

---
 extensions/gallery/script.py | 38 +++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py
index fbf23bc9..e96fe40a 100644
--- a/extensions/gallery/script.py
+++ b/extensions/gallery/script.py
@@ -1,20 +1,23 @@
 from pathlib import Path
-
 import gradio as gr
-
 from modules.html_generator import get_image_cache
+from modules.chat import load_character
+from modules.shared import gradio, settings
 
-
-def generate_html():
+def generate_css():
     css = """
-      .character-gallery {
+      .character-gallery > .gallery {
         margin: 1rem 0;
-        display: grid;
+        display: grid !important;
         grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
         grid-column-gap: 0.4rem;
         grid-row-gap: 1.2rem;
       }
 
+      .character-gallery > .label {
+        display: none !important;
+      }
+
       .character-container {
         cursor: pointer;
         text-align: center;
         overflow-wrap: anywhere;
       }
     """
+    return css
 
     container_html = f'