From bd27353a08df9896ef6b560cde7e3dcb33b7d67d Mon Sep 17 00:00:00 2001
From: Maya <48323879+mayaeary@users.noreply.github.com>
Date: Sun, 19 Mar 2023 12:51:27 +0000
Subject: [PATCH 01/42] Fix duplicating server on ui reload

---
 extensions/api/script.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/extensions/api/script.py b/extensions/api/script.py
index 53e47f3f..1c57c72a 100644
--- a/extensions/api/script.py
+++ b/extensions/api/script.py
@@ -8,6 +8,8 @@ params = {
     'port': 5000,
 }
 
+server = None
+
 class Handler(BaseHTTPRequestHandler):
     def do_GET(self):
         if self.path == '/api/v1/model':
@@ -73,6 +75,7 @@ class Handler(BaseHTTPRequestHandler):
 
 
 def run_server():
+    global server
     server_addr = ('0.0.0.0' if shared.args.listen else '127.0.0.1', params['port'])
     server = ThreadingHTTPServer(server_addr, Handler)
     if shared.args.share: 
@@ -87,4 +90,5 @@ def run_server():
     server.serve_forever()
 
 def ui():
-    Thread(target=run_server, daemon=True).start()
\ No newline at end of file
+    if server is None:
+        Thread(target=run_server, daemon=True).start()
\ No newline at end of file

From 099d7a844b67caa4d33ad272a03d2ab9e18c2a0b Mon Sep 17 00:00:00 2001
From: Maya <48323879+mayaeary@users.noreply.github.com>
Date: Sun, 19 Mar 2023 13:22:24 +0000
Subject: [PATCH 02/42] Add setup method to extensions

---
 extensions/api/script.py | 8 ++------
 modules/extensions.py    | 8 ++++++++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/extensions/api/script.py b/extensions/api/script.py
index 1c57c72a..bbd8551a 100644
--- a/extensions/api/script.py
+++ b/extensions/api/script.py
@@ -8,8 +8,6 @@ params = {
     'port': 5000,
 }
 
-server = None
-
 class Handler(BaseHTTPRequestHandler):
     def do_GET(self):
         if self.path == '/api/v1/model':
@@ -75,7 +73,6 @@ class Handler(BaseHTTPRequestHandler):
 
 
 def run_server():
-    global server
     server_addr = ('0.0.0.0' if shared.args.listen else '127.0.0.1', params['port'])
     server = ThreadingHTTPServer(server_addr, Handler)
     if shared.args.share: 
@@ -89,6 +86,5 @@ def run_server():
         print(f'Starting KoboldAI compatible api at http://{server_addr[0]}:{server_addr[1]}/api')
     server.serve_forever()
 
-def ui():
-    if server is None:
-        Thread(target=run_server, daemon=True).start()
\ No newline at end of file
+def setup():
+    Thread(target=run_server, daemon=True).start()
\ No newline at end of file
diff --git a/modules/extensions.py b/modules/extensions.py
index 836fbc60..9f11c882 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -5,6 +5,7 @@ import modules.shared as shared
 
 state = {}
 available_extensions = []
+setup_called = False
 
 def load_extensions():
     global state
@@ -44,6 +45,13 @@ def create_extensions_block():
                 if _id in shared.settings:
                     extension.params[param] = shared.settings[_id]
 
+    # Running setup function
+    if not setup_called:
+        for extension, name in iterator():
+            if hasattr(extension, "setup"):
+                extension.setup()
+        setup_called = True
+
     # Creating the extension ui elements
     if len(state) > 0:
         with gr.Box(elem_id="extensions"):

From 81c9d130f2a02410ce6da17806564677c09ab844 Mon Sep 17 00:00:00 2001
From: Maya <48323879+mayaeary@users.noreply.github.com>
Date: Sun, 19 Mar 2023 13:25:49 +0000
Subject: [PATCH 03/42] Fix global

---
 modules/extensions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/extensions.py b/modules/extensions.py
index 9f11c882..defc0d66 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -37,6 +37,7 @@ def apply_extensions(text, typ):
     return text
 
 def create_extensions_block():
+    global setup_called
     # Updating the default values
     for extension, name in iterator():
         if hasattr(extension, 'params'):

From acdbd6b708b6c6e91b601a28cdfdb8d86a3cc395 Mon Sep 17 00:00:00 2001
From: Maya <48323879+mayaeary@users.noreply.github.com>
Date: Sun, 19 Mar 2023 13:31:21 +0000
Subject: [PATCH 04/42] Check if app should display extensions ui

---
 modules/extensions.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/extensions.py b/modules/extensions.py
index defc0d66..b363bc39 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -46,15 +46,18 @@ def create_extensions_block():
                 if _id in shared.settings:
                     extension.params[param] = shared.settings[_id]
 
+    should_display_ui = False
     # Running setup function
     if not setup_called:
         for extension, name in iterator():
             if hasattr(extension, "setup"):
                 extension.setup()
+            if hasattr(extension, "ui"):
+                should_display_ui = True
         setup_called = True
 
     # Creating the extension ui elements
-    if len(state) > 0:
+    if should_display_ui:
         with gr.Box(elem_id="extensions"):
             gr.Markdown("Extensions")
             for extension, name in iterator():

From ca47e016b4f1651824ad5631a4dcf05ed0f5de4c Mon Sep 17 00:00:00 2001
From: Vladimir Belitskiy <belitskiy@gmail.com>
Date: Mon, 20 Mar 2023 12:55:57 -0400
Subject: [PATCH 05/42] Do not display empty user messages in chat mode.

There doesn't seem to be much value to them - they just take up space while also making it seem like there's still some sort of pseudo-dialogue going on, instead of a monologue by the bot.
---
 modules/html_generator.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 940d5486..f8cff6d8 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -142,22 +142,23 @@ def generate_chat_html(history, name1, name2, character):
               </div>
             """
 
-        if not (i == len(history)-1 and len(row[0]) == 0):
-            output += f"""
-                  <div class="message">
-                    <div class="circle-you">
-                      {img_me}
-                    </div>
-                    <div class="text">
-                      <div class="username">
-                        {name1}
-                      </div>
-                      <div class="message-body">
-                        {row[0]}
-                      </div>
-                    </div>
+        if not row[0]:  # don't display empty user messages
+            continue
+        output += f"""
+              <div class="message">
+                <div class="circle-you">
+                  {img_me}
+                </div>
+                <div class="text">
+                  <div class="username">
+                    {name1}
                   </div>
-                """
+                  <div class="message-body">
+                    {row[0]}
+                  </div>
+                </div>
+              </div>
+            """
 
     output += "</div>"
     return output

From e96687b1d619bc3bb149cca0011adba01274963a Mon Sep 17 00:00:00 2001
From: Vladimir Belitskiy <belitskiy@gmail.com>
Date: Mon, 20 Mar 2023 14:16:48 -0400
Subject: [PATCH 06/42] Do not send empty user input as part of the prompt.

However, if extensions modify the empty prompt to be non-empty,
it'll still work as before.
---
 modules/chat.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 36265990..c1e55ac4 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -33,12 +33,14 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
     i = len(shared.history['internal'])-1
     while i >= 0 and len(encode(''.join(rows), max_new_tokens)[0]) < max_length:
         rows.insert(1, f"{name2}: {shared.history['internal'][i][1].strip()}\n")
-        if not (shared.history['internal'][i][0] == '<|BEGIN-VISIBLE-CHAT|>'):
+        prev_user_input = shared.history['internal'][i][0]
+        if prev_user_input and not shared.history['internal'][i][0] == '<|BEGIN-VISIBLE-CHAT|>':
             rows.insert(1, f"{name1}: {shared.history['internal'][i][0].strip()}\n")
         i -= 1
 
     if not impersonate:
-        rows.append(f"{name1}: {user_input}\n")
+        if user_input:
+            rows.append(f"{name1}: {user_input}\n")
         rows.append(apply_extensions(f"{name2}:", "bot_prefix"))
         limit = 3
     else:

From eac27f4f556b2e4fd149e65e2395fbc9ce2ea3c7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 00:55:33 -0300
Subject: [PATCH 07/42] Make LoRAs work in 16-bit mode

---
 modules/LoRA.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index 6915e157..20850338 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -13,10 +13,15 @@ def add_lora_to_model(lora_name):
         print("Reloading the model to remove the LoRA...")
         shared.model, shared.tokenizer = load_model(shared.model_name)
     else:
-        # Why doesn't this work in 16-bit mode?
         print(f"Adding the LoRA {lora_name} to the model...")
-
+        
         params = {}
-        params['device_map'] = {'': 0}
-        #params['dtype'] = shared.model.dtype
+        if shared.args.load_in_8bit:
+            params['device_map'] = {'': 0}
+        else:
+            params['device_map'] = 'auto' 
+            params['dtype'] = shared.model.dtype
+            
         shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params)
+        if not shared.args.load_in_8bit:
+            shared.model.half()

From 29bd41d453cc8404b7183af685cdd4b952e96435 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 01:05:13 -0300
Subject: [PATCH 08/42] Fix LoRA in CPU mode

---
 modules/LoRA.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index 20850338..0a2aaa7d 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -18,10 +18,10 @@ def add_lora_to_model(lora_name):
         params = {}
         if shared.args.load_in_8bit:
             params['device_map'] = {'': 0}
-        else:
+        elif not shared.args.cpu:
             params['device_map'] = 'auto' 
             params['dtype'] = shared.model.dtype
             
         shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params)
-        if not shared.args.load_in_8bit:
+        if not shared.args.load_in_8bit and not shared.args.cpu:
             shared.model.half()

From c5ebcc5f7e862b1f2c6b1d807bbf2c1aadeb159e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 13:36:00 -0300
Subject: [PATCH 09/42] Change the default names (#518)

* Update shared.py

* Update settings-template.json
---
 modules/shared.py      | 6 +++---
 settings-template.json | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 8d591f4f..720c697e 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -27,9 +27,9 @@ settings = {
     'max_new_tokens': 200,
     'max_new_tokens_min': 1,
     'max_new_tokens_max': 2000,
-    'name1': 'Person 1',
-    'name2': 'Person 2',
-    'context': 'This is a conversation between two people.',
+    'name1': 'You',
+    'name2': 'Assistant',
+    'context': 'This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.',
     'stop_at_newline': False,
     'chat_prompt_size': 2048,
     'chat_prompt_size_min': 0,
diff --git a/settings-template.json b/settings-template.json
index 7a7de7af..79fd5023 100644
--- a/settings-template.json
+++ b/settings-template.json
@@ -2,9 +2,9 @@
     "max_new_tokens": 200,
     "max_new_tokens_min": 1,
     "max_new_tokens_max": 2000,
-    "name1": "Person 1",
-    "name2": "Person 2",
-    "context": "This is a conversation between two people.",
+    "name1": "You",
+    "name2": "Assistant",
+    "context": "This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.",
     "stop_at_newline": false,
     "chat_prompt_size": 2048,
     "chat_prompt_size_min": 0,

From 9bf6ecf9e2de9b72c3fa62e0e6f5b5e9041825b1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 16:49:41 -0300
Subject: [PATCH 10/42] Fix LoRA device map (attempt)

---
 modules/LoRA.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index 0a2aaa7d..5f77e340 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -16,12 +16,15 @@ def add_lora_to_model(lora_name):
         print(f"Adding the LoRA {lora_name} to the model...")
         
         params = {}
-        if shared.args.load_in_8bit:
-            params['device_map'] = {'': 0}
-        elif not shared.args.cpu:
-            params['device_map'] = 'auto' 
+        if not shared.args.cpu:
             params['dtype'] = shared.model.dtype
+            if hasattr(shared.model, "hf_device_map"):
+                params['device_map'] = {"base_model.model."+k: v for k, v in shared.model.hf_device_map.items()}
+            elif shared.args.load_in_8bit:
+                params['device_map'] = {'': 0}
             
         shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params)
         if not shared.args.load_in_8bit and not shared.args.cpu:
             shared.model.half()
+            if not hasattr(shared.model, "hf_device_map"):
+                shared.model.cuda()

From 4578e88ffd77dc249fa97d0ec8cb667b21089ba8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 21:38:20 -0300
Subject: [PATCH 11/42] Stop the bot from talking for you in chat mode

---
 modules/RWKV.py            |  4 ++--
 modules/callbacks.py       | 20 ++++++++---------
 modules/chat.py            | 44 ++++++++++++++-----------------------
 modules/text_generation.py | 45 +++++++++++++++++++-------------------
 4 files changed, 51 insertions(+), 62 deletions(-)

diff --git a/modules/RWKV.py b/modules/RWKV.py
index 5cf8937a..8c7ea2b9 100644
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@@ -45,11 +45,11 @@ class RWKVModel:
             token_stop = token_stop
         )
 
-        return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
 
     def generate_with_streaming(self, **kwargs):
         with Iteratorize(self.generate, kwargs, callback=None) as generator:
-            reply = kwargs['context']
+            reply = ''
             for token in generator:
                 reply += token
                 yield reply
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 12a90cc3..2ae9d908 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -11,24 +11,22 @@ import modules.shared as shared
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
 
-    def __init__(self, sentinel_token_ids: torch.LongTensor,
-                 starting_idx: int):
+    def __init__(self, sentinel_token_ids: list[torch.LongTensor], starting_idx: int):
         transformers.StoppingCriteria.__init__(self)
         self.sentinel_token_ids = sentinel_token_ids
         self.starting_idx = starting_idx
 
-    def __call__(self, input_ids: torch.LongTensor,
-                 _scores: torch.FloatTensor) -> bool:
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
         for sample in input_ids:
             trimmed_sample = sample[self.starting_idx:]
-            # Can't unfold, output is still too tiny. Skip.
-            if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
-                continue
 
-            for window in trimmed_sample.unfold(
-                    0, self.sentinel_token_ids.shape[-1], 1):
-                if torch.all(torch.eq(self.sentinel_token_ids, window)):
-                    return True
+            for i in range(len(self.sentinel_token_ids)):
+                # Can't unfold, output is still too tiny. Skip.
+                if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
+                    continue
+                for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
+                    if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
+                        return True
         return False
 
 class Stream(transformers.StoppingCriteria):
diff --git a/modules/chat.py b/modules/chat.py
index 78fc4ab5..b1280d48 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -51,41 +51,31 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
     prompt = ''.join(rows)
     return prompt
 
-def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
+def extract_message_from_reply(reply, name1, name2, check):
     next_character_found = False
 
-    asker = name1 if not impersonate else name2
-    replier = name2 if not impersonate else name1
-
-    previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
-    idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
-    idx = idx[max(len(previous_idx)-1, 0)]
-
-    if not impersonate:
-        reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
-    else:
-        reply = reply[idx + 1 + len(f"{replier}:"):]
-
     if check:
         lines = reply.split('\n')
         reply = lines[0].strip()
         if len(lines) > 1:
             next_character_found = True
     else:
-        idx = reply.find(f"\n{asker}:")
-        if idx != -1:
-            reply = reply[:idx]
-            next_character_found = True
-        reply = fix_newlines(reply)
+        for string in [f"\n{name1}:", f"\n{name2}:"]:
+            idx = reply.find(string)
+            if idx != -1:
+                reply = reply[:idx]
+                next_character_found = True
 
         # If something like "\nYo" is generated just before "\nYou:"
         # is completed, trim it
-        next_turn = f"\n{asker}:"
-        for j in range(len(next_turn)-1, 0, -1):
-            if reply[-j:] == next_turn[:j]:
-                reply = reply[:-j]
-                break
+        if not next_character_found:
+            for string in [f"\n{name1}:", f"\n{name2}:"]:
+                for j in range(len(string)-1, 0, -1):
+                    if reply[-j:] == string[:j]:
+                        reply = reply[:-j]
+                        break
 
+    reply = fix_newlines(reply)
     return reply, next_character_found
 
 def stop_everything_event():
@@ -127,10 +117,10 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
     # Generate
     reply = ''
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name1}:"):
+        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
 
             # Extracting the reply
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check)
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
             visible_reply = apply_extensions(visible_reply, "output")
             if shared.args.chat:
@@ -166,8 +156,8 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
     # Yield *Is typing...*
     yield shared.processing_message
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name2}:"):
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
+        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             yield reply
             if next_character_found:
                 break
diff --git a/modules/text_generation.py b/modules/text_generation.py
index e738cb21..fd017e2c 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -99,25 +99,37 @@ def set_manual_seed(seed):
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(seed)
 
-def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_string=None):
+def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_strings=[]):
     clear_torch_cache()
     set_manual_seed(seed)
     t0 = time.time()
 
+    original_question = question
+    if not (shared.args.chat or shared.args.cai_chat):
+        question = apply_extensions(question, "input")
+    if shared.args.verbose:
+        print(f"\n\n{question}\n--------------------\n")
+
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
     if shared.is_RWKV:
         try:
             if shared.args.no_stream:
                 reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
+                if not (shared.args.chat or shared.args.cai_chat):
+                    reply = original_question + apply_extensions(reply, "output")
                 yield formatted_outputs(reply, shared.model_name)
             else:
                 if not (shared.args.chat or shared.args.cai_chat):
                     yield formatted_outputs(question, shared.model_name)
+
                 # RWKV has proper streaming, which is very nice.
                 # No need to generate 8 tokens at a time.
                 for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
+                    if not (shared.args.chat or shared.args.cai_chat):
+                        reply = original_question + apply_extensions(reply, "output")
                     yield formatted_outputs(reply, shared.model_name)
+
         except Exception:
             traceback.print_exc()
         finally:
@@ -127,12 +139,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
             return
 
-    original_question = question
-    if not (shared.args.chat or shared.args.cai_chat):
-        question = apply_extensions(question, "input")
-    if shared.args.verbose:
-        print(f"\n\n{question}\n--------------------\n")
-
     input_ids = encode(question, max_new_tokens)
     original_input_ids = input_ids
     output = input_ids[0]
@@ -142,9 +148,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
     if eos_token is not None:
         eos_token_ids.append(int(encode(eos_token)[0][-1]))
     stopping_criteria_list = transformers.StoppingCriteriaList()
-    if stopping_string is not None:
-        # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
-        t = encode(stopping_string, 0, add_special_tokens=False)
+    if type(stopping_strings) is list and len(stopping_strings) > 0:
+        t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings]
         stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
 
     generate_params = {}
@@ -195,12 +200,10 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             if shared.soft_prompt:
                 output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
 
+            new_tokens = len(output) - len(input_ids[0])
+            reply = decode(output[-new_tokens:])
             if not (shared.args.chat or shared.args.cai_chat):
-                new_tokens = len(output) - len(input_ids[0])
-                reply = decode(output[-new_tokens:])
                 reply = original_question + apply_extensions(reply, "output")
-            else:
-                reply = decode(output)
 
             yield formatted_outputs(reply, shared.model_name)
 
@@ -223,12 +226,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                 for output in generator:
                     if shared.soft_prompt:
                         output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+
+                    new_tokens = len(output) - len(input_ids[0])
+                    reply = decode(output[-new_tokens:])
                     if not (shared.args.chat or shared.args.cai_chat):
-                        new_tokens = len(output) - len(input_ids[0])
-                        reply = decode(output[-new_tokens:])
                         reply = original_question + apply_extensions(reply, "output")
-                    else:
-                        reply = decode(output)
 
                     if output[-1] in eos_token_ids:
                         break
@@ -244,12 +246,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                     output = shared.model.generate(**generate_params)[0]
                 if shared.soft_prompt:
                     output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+
+                new_tokens = len(output) - len(original_input_ids[0])
+                reply = decode(output[-new_tokens:])
                 if not (shared.args.chat or shared.args.cai_chat):
-                    new_tokens = len(output) - len(original_input_ids[0])
-                    reply = decode(output[-new_tokens:])
                     reply = original_question + apply_extensions(reply, "output")
-                else:
-                    reply = decode(output)
 
                 if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                     break

From bf22d16ebcee96430d6845c9786bbdab5e74af17 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 21:56:26 -0300
Subject: [PATCH 12/42] Clear cache while switching LoRAs

---
 modules/LoRA.py      | 15 +++++++++------
 modules/callbacks.py |  8 +-------
 server.py            | 14 +++-----------
 3 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index 5f77e340..1c03826b 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -2,19 +2,22 @@ from pathlib import Path
 
 import modules.shared as shared
 from modules.models import load_model
+from modules.text_generation import clear_torch_cache
 
 
+def reload_model():
+    shared.model = shared.tokenizer = None
+    clear_torch_cache()
+    shared.model, shared.tokenizer = load_model(shared.model_name)
+
 def add_lora_to_model(lora_name):
 
     from peft import PeftModel
 
-    # Is there a more efficient way of returning to the base model?
-    if lora_name == "None":
-        print("Reloading the model to remove the LoRA...")
-        shared.model, shared.tokenizer = load_model(shared.model_name)
-    else:
+    reload_model()
+
+    if lora_name != "None":
         print(f"Adding the LoRA {lora_name} to the model...")
-        
         params = {}
         if not shared.args.cpu:
             params['dtype'] = shared.model.dtype
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 2ae9d908..50a69183 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -1,11 +1,10 @@
-import gc
 from queue import Queue
 from threading import Thread
 
 import torch
 import transformers
 
-import modules.shared as shared
+from modules.text_generation import clear_torch_cache
 
 
 # Copied from https://github.com/PygmalionAI/gradio-ui/
@@ -90,8 +89,3 @@ class Iteratorize:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.stop_now = True
         clear_torch_cache()
-
-def clear_torch_cache():
-    gc.collect()
-    if not shared.args.cpu:
-        torch.cuda.empty_cache()
diff --git a/server.py b/server.py
index cdf7aa93..068f380a 100644
--- a/server.py
+++ b/server.py
@@ -1,4 +1,3 @@
-import gc
 import io
 import json
 import re
@@ -8,7 +7,6 @@ import zipfile
 from pathlib import Path
 
 import gradio as gr
-import torch
 
 import modules.chat as chat
 import modules.extensions as extensions_module
@@ -17,7 +15,7 @@ import modules.ui as ui
 from modules.html_generator import generate_chat_html
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, load_soft_prompt
-from modules.text_generation import generate_reply
+from modules.text_generation import clear_torch_cache, generate_reply
 
 # Loading custom settings
 settings_file = None
@@ -56,21 +54,15 @@ def load_model_wrapper(selected_model):
     if selected_model != shared.model_name:
         shared.model_name = selected_model
         shared.model = shared.tokenizer = None
-        if not shared.args.cpu:
-            gc.collect()
-            torch.cuda.empty_cache()
+        clear_torch_cache()
         shared.model, shared.tokenizer = load_model(shared.model_name)
 
     return selected_model
 
 def load_lora_wrapper(selected_lora):
     shared.lora_name = selected_lora
-    default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
-
-    if not shared.args.cpu:
-        gc.collect()
-        torch.cuda.empty_cache()
     add_lora_to_model(selected_lora)
+    default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
 
     return selected_lora, default_text
 

From b0abb327d822f8fe4c0180a4a725c0e362182b8f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 22:02:09 -0300
Subject: [PATCH 13/42] Update LoRA.py

---
 modules/LoRA.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index 1c03826b..aa68ad32 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -14,7 +14,11 @@ def add_lora_to_model(lora_name):
 
     from peft import PeftModel
 
-    reload_model()
+    # If a LoRA had been previously loaded, or if we want
+    # to unload a LoRA, reload the model
+    if shared.lora_name != "None" or lora_name == "None":
+        reload_model()
+    shared.lora_name = lora_name
 
     if lora_name != "None":
         print(f"Adding the LoRA {lora_name} to the model...")

From 9bdb3c784d07b4f81f8dc39a97796d231bd89bff Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 22:02:40 -0300
Subject: [PATCH 14/42] Minor fix

---
 server.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/server.py b/server.py
index 068f380a..435b8525 100644
--- a/server.py
+++ b/server.py
@@ -60,7 +60,6 @@ def load_model_wrapper(selected_model):
     return selected_model
 
 def load_lora_wrapper(selected_lora):
-    shared.lora_name = selected_lora
     add_lora_to_model(selected_lora)
     default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
 

From d1327f99f915aca83abac739107cdb8c5d29d278 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 22:12:24 -0300
Subject: [PATCH 15/42] Fix broken callbacks.py

---
 modules/callbacks.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/callbacks.py b/modules/callbacks.py
index 50a69183..93cd1d63 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -4,8 +4,6 @@ from threading import Thread
 import torch
 import transformers
 
-from modules.text_generation import clear_torch_cache
-
 
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
@@ -89,3 +87,8 @@ class Iteratorize:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.stop_now = True
         clear_torch_cache()
+
+def clear_torch_cache():
+    gc.collect()
+    if not shared.args.cpu:
+        torch.cuda.empty_cache()

From 7078d168c31084255a99e1b4fd879e9a8a353a0d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 22:16:08 -0300
Subject: [PATCH 16/42] Missing import

---
 modules/callbacks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/callbacks.py b/modules/callbacks.py
index 93cd1d63..40811408 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -1,3 +1,4 @@
+import gc
 from queue import Queue
 from threading import Thread
 

From 8747c74339cf1e7f1d45f4aa1dcc090e9eba94a3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 22:19:01 -0300
Subject: [PATCH 17/42] Another missing import

---
 modules/callbacks.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/callbacks.py b/modules/callbacks.py
index 40811408..2ae9d908 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -5,6 +5,8 @@ from threading import Thread
 import torch
 import transformers
 
+import modules.shared as shared
+
 
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):

From dcfd866402dfbbc849bd4441fd1de9448de18c75 Mon Sep 17 00:00:00 2001
From: EyeDeck <eyedeck@gmail.com>
Date: Thu, 23 Mar 2023 21:31:34 -0400
Subject: [PATCH 18/42] Allow loading of .safetensors through GPTQ-for-LLaMa

---
 modules/GPTQ_loader.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index 32a5458f..bec6c66f 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -37,21 +37,23 @@ def load_quantized(model_name):
 
     path_to_model = Path(f'models/{model_name}')
     if path_to_model.name.lower().startswith('llama-7b'):
-        pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-7b-{shared.args.gptq_bits}bit'
     elif path_to_model.name.lower().startswith('llama-13b'):
-        pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-13b-{shared.args.gptq_bits}bit'
     elif path_to_model.name.lower().startswith('llama-30b'):
-        pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-30b-{shared.args.gptq_bits}bit'
     elif path_to_model.name.lower().startswith('llama-65b'):
-        pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-65b-{shared.args.gptq_bits}bit'
     else:
-        pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'{model_name}-{shared.args.gptq_bits}bit'
 
-    # Try to find the .pt both in models/ and in the subfolder
+    # Try to find the .safetensors or .pt both in models/ and in the subfolder
     pt_path = None
-    for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
+    for path in [Path(p+ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
         if path.exists():
+            print(f"Found {path}")
             pt_path = path
+            break
 
     if not pt_path:
         print(f"Could not find {pt_model}, exiting...")

From 143b5b5edf5d47539496598dbdb6cfe4843c169a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 Mar 2023 23:28:50 -0300
Subject: [PATCH 19/42] Mention one-click-bandaid in the README

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index cb070445..85dcc270 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,10 @@ Just download the zip above, extract it, and double click on "install". The web
 
 Source codes: https://github.com/oobabooga/one-click-installers
 
+> **Note**
+> 
+> To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid).
+
 This method lags behind the newest developments and does not support 8-bit mode on Windows without additional set up: https://github.com/oobabooga/text-generation-webui/issues/147#issuecomment-1456040134, https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652
 
 ### Alternative: Docker

From bb4cb2245373acb950e1c8dbaa73caf75920723d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 00:49:04 -0300
Subject: [PATCH 20/42] Download .pt files using download-model.py (for 4-bit
 models)

---
 download-model.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/download-model.py b/download-model.py
index 7c2965f6..7ca33b7d 100644
--- a/download-model.py
+++ b/download-model.py
@@ -116,10 +116,11 @@ def get_download_links_from_huggingface(model, branch):
 
             is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
             is_safetensors = re.match("model.*\.safetensors", fname)
+            is_pt = re.match(".*\.pt", fname)
             is_tokenizer = re.match("tokenizer.*\.model", fname)
             is_text = re.match(".*\.(txt|json|py)", fname) or is_tokenizer
 
-            if any((is_pytorch, is_safetensors, is_text, is_tokenizer)):
+            if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)):
                 if is_text:
                     links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
                     classifications.append('text')
@@ -132,7 +133,8 @@ def get_download_links_from_huggingface(model, branch):
                     elif is_pytorch:
                         has_pytorch = True
                         classifications.append('pytorch')
-
+                    elif is_pt:
+                        classifications.append('pt')
 
         cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
         cursor = base64.b64encode(cursor)

From 04417b658b53207c805851145c96bc1ce903937b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 01:40:43 -0300
Subject: [PATCH 21/42] Update README.md

---
 README.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 85dcc270..4e4959ac 100644
--- a/README.md
+++ b/README.md
@@ -84,10 +84,6 @@ pip install -r requirements.txt
 > 
 > For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859
 
-### Alternative: native Windows installation
-
-As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
-
 ### Alternative: one-click installers
 
 [oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip)
@@ -105,7 +101,9 @@ Source codes: https://github.com/oobabooga/one-click-installers
 > 
 > To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid).
 
-This method lags behind the newest developments and does not support 8-bit mode on Windows without additional set up: https://github.com/oobabooga/text-generation-webui/issues/147#issuecomment-1456040134, https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652
+### Alternative: native Windows installation
+
+As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
 
 ### Alternative: Docker
 

From 4f5c2ce78560689dc8ed08a3cbb33ef15a3b4a95 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 02:03:30 -0300
Subject: [PATCH 22/42] Fix chat_generation_attempts

---
 modules/chat.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index b1280d48..061177d2 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -115,9 +115,10 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
         yield shared.history['visible']+[[visible_text, shared.processing_message]]
 
     # Generate
-    reply = ''
+    cumulative_reply = ''
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+        for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+            reply = cumulative_reply + reply
 
             # Extracting the reply
             reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
@@ -142,6 +143,8 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
             if next_character_found:
                 break
 
+        cumulative_reply = reply
+
     yield shared.history['visible']
 
 def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
@@ -152,16 +155,21 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
 
     prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True)
 
-    reply = ''
     # Yield *Is typing...*
     yield shared.processing_message
+
+    cumulative_reply = ''
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+        for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+            reply = cumulative_reply + reply
             reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             yield reply
             if next_character_found:
                 break
-        yield reply
+
+        cumulative_reply = reply
+
+    yield reply
 
 def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
     for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts):

From b740c5b2847ee778c20f8232d94f25ab84fce108 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Fri, 24 Mar 2023 08:56:07 -0500
Subject: [PATCH 23/42] Add display of context when input was generated

Not sure if I did this right, but it does move with the conversation and seems to match the expected value.
---
 modules/text_generation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/text_generation.py b/modules/text_generation.py
index fd017e2c..9b2c233d 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -270,5 +270,5 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
         traceback.print_exc()
     finally:
         t1 = time.time()
-        print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)")
+        print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens, context {len(original_input_ids[0])})")
         return

From fd99995b01878246b62302d31a844dd68ee7d139 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 15:59:27 -0300
Subject: [PATCH 24/42] Make the Stop button more consistent in chat mode

---
 server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server.py b/server.py
index 435b8525..7b25e91d 100644
--- a/server.py
+++ b/server.py
@@ -329,7 +329,7 @@ def create_interface():
             gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
             gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
             gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream))
-            shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events)
+            shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events, queue=False)
 
             shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream)
             shared.gradio['Replace last reply'].click(chat.replace_last_reply, [shared.gradio['textbox'], shared.gradio['name1'], shared.gradio['name2']], shared.gradio['display'], show_progress=shared.args.no_stream)

From d8e950d6bdf933f8a0cd78a0c7cb2a941b8d32e3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 16:30:32 -0300
Subject: [PATCH 25/42] Don't load the model twice when using --lora

---
 server.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/server.py b/server.py
index 7b25e91d..f423e368 100644
--- a/server.py
+++ b/server.py
@@ -233,9 +233,7 @@ else:
     shared.model_name = available_models[i]
 shared.model, shared.tokenizer = load_model(shared.model_name)
 if shared.args.lora:
-    print(shared.args.lora)
-    shared.lora_name = shared.args.lora
-    add_lora_to_model(shared.lora_name)
+    add_lora_to_model(shared.args.lora)
 
 # Default UI settings
 default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')]

From 8fad84abc2c8eb90718ef7d7084a22c740e20d9b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 16:51:27 -0300
Subject: [PATCH 26/42] Update extensions.py

---
 modules/extensions.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/extensions.py b/modules/extensions.py
index b363bc39..c55dc978 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -1,3 +1,5 @@
+import traceback
+
 import gradio as gr
 
 import extensions
@@ -18,6 +20,7 @@ def load_extensions():
                 print('Ok.')
             except:
                 print('Fail.')
+                traceback.print_exc()
 
 # This iterator returns the extensions in the order specified in the command-line
 def iterator():
@@ -38,6 +41,7 @@ def apply_extensions(text, typ):
 
 def create_extensions_block():
     global setup_called
+
     # Updating the default values
     for extension, name in iterator():
         if hasattr(extension, 'params'):
@@ -47,6 +51,7 @@ def create_extensions_block():
                     extension.params[param] = shared.settings[_id]
 
     should_display_ui = False
+
     # Running setup function
     if not setup_called:
         for extension, name in iterator():

From 4a724ed22fc8942677f44df39674b571450ea51c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 16:53:56 -0300
Subject: [PATCH 27/42] Reorder imports

---
 extensions/api/script.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/extensions/api/script.py b/extensions/api/script.py
index bbd8551a..7783594c 100644
--- a/extensions/api/script.py
+++ b/extensions/api/script.py
@@ -1,8 +1,9 @@
+import json
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from threading import Thread
+
 from modules import shared
-from modules.text_generation import generate_reply, encode
-import json
+from modules.text_generation import encode, generate_reply
 
 params = {
     'port': 5000,
@@ -87,4 +88,4 @@ def run_server():
     server.serve_forever()
 
 def setup():
-    Thread(target=run_server, daemon=True).start()
\ No newline at end of file
+    Thread(target=run_server, daemon=True).start()

From ffb0187e83043ddbbc7ab1b29e843a1ee6107b54 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 17:17:29 -0300
Subject: [PATCH 28/42] Update chat.py

---
 modules/chat.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index c1e55ac4..0dc5c922 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -34,12 +34,12 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
     while i >= 0 and len(encode(''.join(rows), max_new_tokens)[0]) < max_length:
         rows.insert(1, f"{name2}: {shared.history['internal'][i][1].strip()}\n")
         prev_user_input = shared.history['internal'][i][0]
-        if prev_user_input and not shared.history['internal'][i][0] == '<|BEGIN-VISIBLE-CHAT|>':
-            rows.insert(1, f"{name1}: {shared.history['internal'][i][0].strip()}\n")
+        if len(prev_user_input) > 0 and prev_user_input != '<|BEGIN-VISIBLE-CHAT|>':
+            rows.insert(1, f"{name1}: {prev_user_input.strip()}\n")
         i -= 1
 
     if not impersonate:
-        if user_input:
+        if len(user_input) > 0:
             rows.append(f"{name1}: {user_input}\n")
         rows.append(apply_extensions(f"{name2}:", "bot_prefix"))
         limit = 3

From 6e1b16c2aa87d167ed9893e800c199408ba946d3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 17:18:27 -0300
Subject: [PATCH 29/42] Update html_generator.py

---
 modules/html_generator.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index f8cff6d8..ff18c913 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -119,13 +119,13 @@ def load_html_image(paths):
 
 def generate_chat_html(history, name1, name2, character):
     output = f'<style>{cai_css}</style><div class="chat" id="chat">'
-    
+
     img_bot = load_html_image([f"characters/{character}.{ext}" for ext in ['png', 'jpg', 'jpeg']] + ["img_bot.png","img_bot.jpg","img_bot.jpeg"])
     img_me = load_html_image(["img_me.png", "img_me.jpg", "img_me.jpeg"])
 
     for i,_row in enumerate(history[::-1]):
         row = [convert_to_markdown(entry) for entry in _row]
-        
+
         output += f"""
               <div class="message">
                 <div class="circle-bot">
@@ -142,8 +142,9 @@ def generate_chat_html(history, name1, name2, character):
               </div>
             """
 
-        if not row[0]:  # don't display empty user messages
+        if len(row[0]) == 0: # don't display empty user messages
             continue
+
         output += f"""
               <div class="message">
                 <div class="circle-you">

From a80aa65986ec159a5f9a198455b4f8f061b0d52f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 19:53:20 -0300
Subject: [PATCH 30/42] Update models.py

---
 modules/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index ccb97da3..c9f03588 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -44,7 +44,7 @@ def load_model(model_name):
     shared.is_RWKV = model_name.lower().startswith('rwkv-')
 
     # Default settings
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
             model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
@@ -95,7 +95,7 @@ def load_model(model_name):
         return model, tokenizer
 
     # Quantized model
-    elif shared.args.gptq_bits > 0:
+    elif shared.args.wbits > 0:
         from modules.GPTQ_loader import load_quantized
 
         model = load_quantized(model_name)

From 0a162244513b9d2946ad6ffff8b40c8d77e342ae Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 19:54:36 -0300
Subject: [PATCH 31/42] Update GPTQ_loader.py

---
 modules/GPTQ_loader.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index 32a5458f..b58c8964 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -14,7 +14,7 @@ import opt
 
 
 def load_quantized(model_name):
-    if not shared.args.gptq_model_type:
+    if not shared.args.model_type:
         # Try to determine model type from model name
         model_type = model_name.split('-')[0].lower()
         if model_type not in ('llama', 'opt'):
@@ -22,10 +22,10 @@ def load_quantized(model_name):
                   "argument")
             exit()
     else:
-        model_type = shared.args.gptq_model_type.lower()
+        model_type = shared.args.model_type.lower()
 
     if model_type == 'llama':
-        if not shared.args.gptq_pre_layer:
+        if not shared.args.pre_layer:
             load_quant = llama.load_quant
         else:
             load_quant = llama_inference_offload.load_quant
@@ -37,15 +37,15 @@ def load_quantized(model_name):
 
     path_to_model = Path(f'models/{model_name}')
     if path_to_model.name.lower().startswith('llama-7b'):
-        pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-7b-{shared.args.wbits}bit.pt'
     elif path_to_model.name.lower().startswith('llama-13b'):
-        pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-13b-{shared.args.wbits}bit.pt'
     elif path_to_model.name.lower().startswith('llama-30b'):
-        pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-30b-{shared.args.wbits}bit.pt'
     elif path_to_model.name.lower().startswith('llama-65b'):
-        pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'llama-65b-{shared.args.wbits}bit.pt'
     else:
-        pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt'
+        pt_model = f'{model_name}-{shared.args.wbits}bit.pt'
 
     # Try to find the .pt both in models/ and in the subfolder
     pt_path = None
@@ -58,10 +58,10 @@ def load_quantized(model_name):
         exit()
 
     # qwopqwop200's offload
-    if shared.args.gptq_pre_layer:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer)
+    if shared.args.pre_layer:
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.pre_layer)
     else:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits)
 
         # accelerate offload (doesn't work properly)
         if shared.args.gpu_memory:

From a6bf54739c61ac230e94f95ff209004221efeb86 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 19:56:45 -0300
Subject: [PATCH 32/42] Revert models.py (accident)

---
 modules/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index c9f03588..ccb97da3 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -44,7 +44,7 @@ def load_model(model_name):
     shared.is_RWKV = model_name.lower().startswith('rwkv-')
 
     # Default settings
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
             model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
@@ -95,7 +95,7 @@ def load_model(model_name):
         return model, tokenizer
 
     # Quantized model
-    elif shared.args.wbits > 0:
+    elif shared.args.gptq_bits > 0:
         from modules.GPTQ_loader import load_quantized
 
         model = load_quantized(model_name)

From 9fa47c0eed275f64a16c01bea7df2c9aba16f13b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 24 Mar 2023 19:57:12 -0300
Subject: [PATCH 33/42] Revert GPTQ_loader.py (accident)

---
 modules/GPTQ_loader.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index b58c8964..32a5458f 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -14,7 +14,7 @@ import opt
 
 
 def load_quantized(model_name):
-    if not shared.args.model_type:
+    if not shared.args.gptq_model_type:
         # Try to determine model type from model name
         model_type = model_name.split('-')[0].lower()
         if model_type not in ('llama', 'opt'):
@@ -22,10 +22,10 @@ def load_quantized(model_name):
                   "argument")
             exit()
     else:
-        model_type = shared.args.model_type.lower()
+        model_type = shared.args.gptq_model_type.lower()
 
     if model_type == 'llama':
-        if not shared.args.pre_layer:
+        if not shared.args.gptq_pre_layer:
             load_quant = llama.load_quant
         else:
             load_quant = llama_inference_offload.load_quant
@@ -37,15 +37,15 @@ def load_quantized(model_name):
 
     path_to_model = Path(f'models/{model_name}')
     if path_to_model.name.lower().startswith('llama-7b'):
-        pt_model = f'llama-7b-{shared.args.wbits}bit.pt'
+        pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt'
     elif path_to_model.name.lower().startswith('llama-13b'):
-        pt_model = f'llama-13b-{shared.args.wbits}bit.pt'
+        pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt'
     elif path_to_model.name.lower().startswith('llama-30b'):
-        pt_model = f'llama-30b-{shared.args.wbits}bit.pt'
+        pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt'
     elif path_to_model.name.lower().startswith('llama-65b'):
-        pt_model = f'llama-65b-{shared.args.wbits}bit.pt'
+        pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt'
     else:
-        pt_model = f'{model_name}-{shared.args.wbits}bit.pt'
+        pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt'
 
     # Try to find the .pt both in models/ and in the subfolder
     pt_path = None
@@ -58,10 +58,10 @@ def load_quantized(model_name):
         exit()
 
     # qwopqwop200's offload
-    if shared.args.pre_layer:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.pre_layer)
+    if shared.args.gptq_pre_layer:
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer)
     else:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits)
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
 
         # accelerate offload (doesn't work properly)
         if shared.args.gpu_memory:

From 25be9698c74d7af950cbcbf8ec4c0cd9bebc6d3c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 25 Mar 2023 01:18:32 -0300
Subject: [PATCH 34/42] Fix LoRA on mps

---
 modules/LoRA.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index aa68ad32..283fcf4c 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -1,5 +1,7 @@
 from pathlib import Path
 
+import torch
+
 import modules.shared as shared
 from modules.models import load_model
 from modules.text_generation import clear_torch_cache
@@ -34,4 +36,8 @@ def add_lora_to_model(lora_name):
         if not shared.args.load_in_8bit and not shared.args.cpu:
             shared.model.half()
             if not hasattr(shared.model, "hf_device_map"):
-                shared.model.cuda()
+                if torch.has_mps:
+                    device = torch.device('mps')
+                    shared.model = shared.model.to(device)
+                else:
+                    shared.model = shared.model.cuda()

From 70f9565f37c47be34d4bdbabe3c874bc4c4c7039 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 25 Mar 2023 02:35:30 -0300
Subject: [PATCH 35/42] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4e4959ac..60444401 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 * [FlexGen offload](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen).
 * [DeepSpeed ZeRO-3 offload](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed).
 * Get responses via API, [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-streaming.py) or [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming.
-* [LLaMA model, including 4-bit mode](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model).
+* [LLaMA model, including 4-bit GPTQ support](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model).
 * [RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model).
 * [Supports LoRAs](https://github.com/oobabooga/text-generation-webui/wiki/Using-LoRAs).
 * Supports softprompts.

From 8c8e8b44508972a37fd15d760f9e4214e5105306 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 25 Mar 2023 12:35:52 -0300
Subject: [PATCH 36/42] Fix the early stopping callback #559

---
 modules/callbacks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/callbacks.py b/modules/callbacks.py
index 2ae9d908..8d30d615 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -25,7 +25,7 @@ class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
                 if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
                     continue
                 for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
-                    if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
+                    if torch.all(torch.eq(self.sentinel_token_ids[i][0], window)):
                         return True
         return False
 

From 9ccf505ccd8484a6af6bb954ce1deea6ce035b05 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Sat, 25 Mar 2023 10:04:00 -0700
Subject: [PATCH 37/42] improve/simplify gitignore

- add repositories
- remove the redundant "/*" on folders
- remove the exclusions for files that already exist
---
 .gitignore | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3cfbbb22..00198b8f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,26 +1,20 @@
-cache/*
-characters/*
-extensions/silero_tts/outputs/*
-extensions/elevenlabs_tts/outputs/*
-extensions/sd_api_pictures/outputs/*
-logs/*
-loras/*
-models/*
-softprompts/*
-torch-dumps/*
+cache
+characters
+extensions/silero_tts/outputs
+extensions/elevenlabs_tts/outputs
+extensions/sd_api_pictures/outputs
+logs
+loras
+models
+softprompts
+torch-dumps
 *pycache*
 */*pycache*
 */*/pycache*
 venv/
 .venv/
+repositories
 
 settings.json
 img_bot*
 img_me*
-
-!characters/Example.json
-!characters/Example.png
-!loras/place-your-loras-here.txt
-!models/place-your-models-here.txt
-!softprompts/place-your-softprompts-here.txt
-!torch-dumps/place-your-pt-models-here.txt

From 8134c4b334ecc3a6a30e774a7265cbafc42ac6cc Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Sat, 25 Mar 2023 12:41:18 -0700
Subject: [PATCH 38/42] add training/datasets to gitignore for #570

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 00198b8f..36852916 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 cache
 characters
+training/datasets
 extensions/silero_tts/outputs
 extensions/elevenlabs_tts/outputs
 extensions/sd_api_pictures/outputs

From 49c10c5570b595e9d4fdcb496c456a9982ede070 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 26 Mar 2023 00:11:33 -0300
Subject: [PATCH 39/42] Add support for the latest GPTQ models with group-size
 (#530)

**Warning: old 4-bit weights will not work anymore!**

See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights
---
 README.md              |  8 ++---
 modules/GPTQ_loader.py | 66 +++++++++++++++++++++++++-----------------
 modules/models.py      |  4 +--
 modules/shared.py      | 24 ++++++++++-----
 server.py              |  5 ++--
 5 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index 60444401..3bfbc72f 100644
--- a/README.md
+++ b/README.md
@@ -176,10 +176,10 @@ Optionally, you can use the following command-line flags:
 | `--cai-chat`     | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. |
 | `--cpu`          | Use the CPU to generate text.|
 | `--load-in-8bit` | Load the model with 8-bit precision.|
-| `--load-in-4bit` | DEPRECATED: use `--gptq-bits 4` instead. |
-| `--gptq-bits GPTQ_BITS` |  GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. |
-| `--gptq-model-type MODEL_TYPE` |  GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported. |
-| `--gptq-pre-layer GPTQ_PRE_LAYER` |  GPTQ: The number of layers to preload. |
+| `--wbits WBITS`            | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
+| `--model_type MODEL_TYPE`  | GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported. |
+| `--groupsize GROUPSIZE`    | GPTQ: Group size. |
+| `--pre_layer PRE_LAYER`    | GPTQ: The number of layers to preload. |
 | `--bf16`         | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
 | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
 | `--disk`         | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index bec6c66f..afb5695f 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -14,18 +14,21 @@ import opt
 
 
 def load_quantized(model_name):
-    if not shared.args.gptq_model_type:
+    if not shared.args.model_type:
         # Try to determine model type from model name
-        model_type = model_name.split('-')[0].lower()
-        if model_type not in ('llama', 'opt'):
-            print("Can't determine model type from model name. Please specify it manually using --gptq-model-type "
+        if model_name.lower().startswith(('llama', 'alpaca')):
+            model_type = 'llama'
+        elif model_name.lower().startswith(('opt', 'galactica')):
+            model_type = 'opt'
+        else:
+            print("Can't determine model type from model name. Please specify it manually using --model_type "
                   "argument")
             exit()
     else:
-        model_type = shared.args.gptq_model_type.lower()
+        model_type = shared.args.model_type.lower()
 
     if model_type == 'llama':
-        if not shared.args.gptq_pre_layer:
+        if not shared.args.pre_layer:
             load_quant = llama.load_quant
         else:
             load_quant = llama_inference_offload.load_quant
@@ -35,35 +38,44 @@ def load_quantized(model_name):
         print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported")
         exit()
 
+    # Now we are going to try to locate the quantized model file.
     path_to_model = Path(f'models/{model_name}')
-    if path_to_model.name.lower().startswith('llama-7b'):
-        pt_model = f'llama-7b-{shared.args.gptq_bits}bit'
-    elif path_to_model.name.lower().startswith('llama-13b'):
-        pt_model = f'llama-13b-{shared.args.gptq_bits}bit'
-    elif path_to_model.name.lower().startswith('llama-30b'):
-        pt_model = f'llama-30b-{shared.args.gptq_bits}bit'
-    elif path_to_model.name.lower().startswith('llama-65b'):
-        pt_model = f'llama-65b-{shared.args.gptq_bits}bit'
-    else:
-        pt_model = f'{model_name}-{shared.args.gptq_bits}bit'
-
-    # Try to find the .safetensors or .pt both in models/ and in the subfolder
+    found_pts = list(path_to_model.glob("*.pt"))
+    found_safetensors = list(path_to_model.glob("*.safetensors"))
     pt_path = None
-    for path in [Path(p+ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
-        if path.exists():
-            print(f"Found {path}")
-            pt_path = path
-            break
+
+    if len(found_pts) == 1:
+        pt_path = found_pts[0]
+    elif len(found_safetensors) == 1:
+        pt_path = found_safetensors[0]
+    else:
+        if path_to_model.name.lower().startswith('llama-7b'):
+            pt_model = f'llama-7b-{shared.args.wbits}bit'
+        elif path_to_model.name.lower().startswith('llama-13b'):
+            pt_model = f'llama-13b-{shared.args.wbits}bit'
+        elif path_to_model.name.lower().startswith('llama-30b'):
+            pt_model = f'llama-30b-{shared.args.wbits}bit'
+        elif path_to_model.name.lower().startswith('llama-65b'):
+            pt_model = f'llama-65b-{shared.args.wbits}bit'
+        else:
+            pt_model = f'{model_name}-{shared.args.wbits}bit'
+
+        # Try to find the .safetensors or .pt both in models/ and in the subfolder
+        for path in [Path(p+ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
+            if path.exists():
+                print(f"Found {path}")
+                pt_path = path
+                break
 
     if not pt_path:
-        print(f"Could not find {pt_model}, exiting...")
+        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
         exit()
 
     # qwopqwop200's offload
-    if shared.args.gptq_pre_layer:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer)
+    if shared.args.pre_layer:
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer)
     else:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize)
 
         # accelerate offload (doesn't work properly)
         if shared.args.gpu_memory:
diff --git a/modules/models.py b/modules/models.py
index ccb97da3..c9f03588 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -44,7 +44,7 @@ def load_model(model_name):
     shared.is_RWKV = model_name.lower().startswith('rwkv-')
 
     # Default settings
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
             model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
@@ -95,7 +95,7 @@ def load_model(model_name):
         return model, tokenizer
 
     # Quantized model
-    elif shared.args.gptq_bits > 0:
+    elif shared.args.wbits > 0:
         from modules.GPTQ_loader import load_quantized
 
         model = load_quantized(model_name)
diff --git a/modules/shared.py b/modules/shared.py
index 720c697e..87896faf 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -52,7 +52,8 @@ settings = {
         'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
         '^(gpt4chan|gpt-4chan|4chan)': '-----\n--- 865467536\nInput text\n--- 865467537\n',
         '(rosey|chip|joi)_.*_instruct.*': 'User: \n',
-        'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>'
+        'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>',
+        'alpaca-*': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n",
     },
     'lora_prompts': {
         'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
@@ -78,10 +79,15 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch
 parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.')
 parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
 parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
-parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.')
-parser.add_argument('--gptq-bits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.')
-parser.add_argument('--gptq-model-type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported.')
-parser.add_argument('--gptq-pre-layer', type=int, default=0, help='GPTQ: The number of layers to preload.')
+
+parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use --wbits instead.')
+parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.')
+parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.')
+parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
+parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported.')
+parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
+parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to preload.')
+
 parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
 parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
 parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
@@ -109,6 +115,8 @@ parser.add_argument('--verbose', action='store_true', help='Print the prompts to
 args = parser.parse_args()
 
 # Provisional, this will be deleted later
-if args.load_in_4bit:
-    print("Warning: --load-in-4bit is deprecated and will be removed. Use --gptq-bits 4 instead.\n")
-    args.gptq_bits = 4
+deprecated_dict = {'gptq_bits': ['wbits', 0], 'gptq_model_type': ['model_type', None], 'gptq_pre_layer': ['pre_layer', 0]}
+for k, (new_name, default) in deprecated_dict.items():
+    if getattr(args, k) != default:  # the deprecated flag was explicitly set on the command line
+        print(f"Warning: --{k.replace('_', '-')} is deprecated and will be removed. Use --{new_name} instead.")
+        setattr(args, new_name, getattr(args, k))  # forward the old value; getattr/setattr replace eval/exec
diff --git a/server.py b/server.py
index f423e368..f1b95a5b 100644
--- a/server.py
+++ b/server.py
@@ -237,8 +237,9 @@ if shared.args.lora:
 
 # Default UI settings
 default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')]
-default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
-if default_text == '':
+if shared.lora_name != "None":
+    default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
+else:
     default_text = shared.settings['prompts'][next((k for k in shared.settings['prompts'] if re.match(k.lower(), shared.model_name.lower())), 'default')]
 title ='Text generation web UI'
 description = '\n\n# Text generation lab\nGenerate text using Large Language Models.\n'

From 19174842b82505c4627f095ea910ea0a9c998e1c Mon Sep 17 00:00:00 2001
From: Florian Kusche <git@k1k.eu>
Date: Sun, 26 Mar 2023 19:41:14 +0200
Subject: [PATCH 40/42] Also download Markdown files

---
 download-model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/download-model.py b/download-model.py
index 7ca33b7d..25386e5f 100644
--- a/download-model.py
+++ b/download-model.py
@@ -118,7 +118,7 @@ def get_download_links_from_huggingface(model, branch):
             is_safetensors = re.match("model.*\.safetensors", fname)
             is_pt = re.match(".*\.pt", fname)
             is_tokenizer = re.match("tokenizer.*\.model", fname)
-            is_text = re.match(".*\.(txt|json|py)", fname) or is_tokenizer
+            is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer
 
             if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)):
                 if is_text:

From 9ff6a538b6055b6845efd2f0e625386a847945eb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 26 Mar 2023 22:11:19 -0300
Subject: [PATCH 41/42] Bump gradio version

Make sure to upgrade with

`pip install -r requirements.txt --upgrade`
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e5b3de69..c84f2948 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 accelerate==0.17.1
 bitsandbytes==0.37.1
 flexgen==0.1.7
-gradio==3.18.0
+gradio==3.23.0
 markdown
 numpy
 peft==0.2.0

From 1c77fdca4cdfca5c636595a8aaaff3281b859d3a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 26 Mar 2023 22:20:30 -0300
Subject: [PATCH 42/42] Change notebook mode appearance

---
 css/chat.css          |  6 ++++++
 css/main.css          | 10 ++++++++++
 css/main.js           |  2 +-
 modules/extensions.py |  4 ++--
 server.py             | 25 ++++++++++++++-----------
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/css/chat.css b/css/chat.css
index 8d9d88a6..1e703530 100644
--- a/css/chat.css
+++ b/css/chat.css
@@ -23,3 +23,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 .pending.svelte-1ed2p3z {
     opacity: 1;
 }
+
+#extensions {
+  padding: 0;
+  padding: 0;
+}
+
diff --git a/css/main.css b/css/main.css
index 09f3b6a8..97879f01 100644
--- a/css/main.css
+++ b/css/main.css
@@ -54,3 +54,13 @@ ol li p, ul li p {
 .gradio-container-3-18-0 .prose * h1, h2, h3, h4 {
   color: white;
 }
+
+.gradio-container {
+  max-width: 100% !important;
+  padding-top: 0 !important;
+}
+
+#extensions {
+  padding: 15px;
+  padding: 15px;
+}
diff --git a/css/main.js b/css/main.js
index 9db3fe8b..029ecb62 100644
--- a/css/main.js
+++ b/css/main.js
@@ -11,7 +11,7 @@ let extensions = document.getElementById('extensions');
 main_parent.addEventListener('click', function(e) {
     // Check if the main element is visible
     if (main.offsetHeight > 0 && main.offsetWidth > 0) {
-        extensions.style.display = 'block';
+        extensions.style.display = 'flex';
     } else {
         extensions.style.display = 'none';
     }
diff --git a/modules/extensions.py b/modules/extensions.py
index c55dc978..c3cf4de4 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -63,8 +63,8 @@ def create_extensions_block():
 
     # Creating the extension ui elements
     if should_display_ui:
-        with gr.Box(elem_id="extensions"):
-            gr.Markdown("Extensions")
+        with gr.Column(elem_id="extensions"):
             for extension, name in iterator():
+                gr.Markdown(f"\n### {name}")
                 if hasattr(extension, "ui"):
                     extension.ui()
diff --git a/server.py b/server.py
index f1b95a5b..56bb499d 100644
--- a/server.py
+++ b/server.py
@@ -369,19 +369,22 @@ def create_interface():
 
         elif shared.args.notebook:
             with gr.Tab("Text generation", elem_id="main"):
-                with gr.Tab('Raw'):
-                    shared.gradio['textbox'] = gr.Textbox(value=default_text, lines=25)
-                with gr.Tab('Markdown'):
-                    shared.gradio['markdown'] = gr.Markdown()
-                with gr.Tab('HTML'):
-                    shared.gradio['html'] = gr.HTML()
-
                 with gr.Row():
-                    shared.gradio['Stop'] = gr.Button('Stop')
-                    shared.gradio['Generate'] = gr.Button('Generate')
-                shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens'])
+                    with gr.Column(scale=4):
+                        with gr.Tab('Raw'):
+                            shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_id="textbox", lines=25)
+                        with gr.Tab('Markdown'):
+                            shared.gradio['markdown'] = gr.Markdown()
+                        with gr.Tab('HTML'):
+                            shared.gradio['html'] = gr.HTML()
 
-                create_model_and_preset_menus()
+                        with gr.Row():
+                            shared.gradio['Stop'] = gr.Button('Stop')
+                            shared.gradio['Generate'] = gr.Button('Generate')
+                    with gr.Column(scale=1):
+                        shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens'])
+
+                        create_model_and_preset_menus()
             with gr.Tab("Parameters", elem_id="parameters"):
                 create_settings_menus(default_preset)