From 6b546a2c8bf9faa23c81056ac575ff8f7406945e Mon Sep 17 00:00:00 2001
From: chr <chraac@gmail.com>
Date: Mon, 20 May 2024 07:02:19 +0800
Subject: [PATCH 01/64] llama.cpp: increase the max threads from 32 to 256
 (#5889)

---
 modules/ui_model_menu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 52c59463..fe0d3ee1 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -97,8 +97,8 @@ def create_ui():
                             shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
                             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 18,17')
                             shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
-                            shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
-                            shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch)
+                            shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
+                            shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
                             shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                             shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
                             shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")

From 2de586f58695e33ce02250d815ad12dfe21342dd Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 19 May 2024 20:03:18 -0300
Subject: [PATCH 02/64] Update accelerate requirement from ==0.27.* to ==0.30.*
 (#5989)

---
 requirements.txt                 | 2 +-
 requirements_amd.txt             | 2 +-
 requirements_amd_noavx2.txt      | 2 +-
 requirements_apple_intel.txt     | 2 +-
 requirements_apple_silicon.txt   | 2 +-
 requirements_cpu_only.txt        | 2 +-
 requirements_cpu_only_noavx2.txt | 2 +-
 requirements_noavx2.txt          | 2 +-
 requirements_nowheels.txt        | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b71f5d22..65b14570 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 aqlm[gpu,cpu]==1.1.3; platform_system == "Linux"
 bitsandbytes==0.43.*
 colorama
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 967acc52..5231460e 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 colorama
 datasets
 einops
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 40e55880..93ac6288 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 colorama
 datasets
 einops
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 92707892..07726ec5 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 colorama
 datasets
 einops
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 0d7d7d8d..bab40944 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 colorama
 datasets
 einops
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index b4cbf127..a102067b 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 colorama
 datasets
 einops
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 75387b7d..24d3633a 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 colorama
 datasets
 einops
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index aab0b970..aea0607b 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 aqlm[gpu,cpu]==1.1.3; platform_system == "Linux"
 bitsandbytes==0.43.*
 colorama
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index 8da69bd2..135602fe 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -1,4 +1,4 @@
-accelerate==0.27.*
+accelerate==0.30.*
 colorama
 datasets
 einops

From b63dc4e3252de4a85c1c1ac5cbfb24229b938cab Mon Sep 17 00:00:00 2001
From: Samuel Wein <sam@samwein.com>
Date: Mon, 20 May 2024 01:05:17 +0200
Subject: [PATCH 03/64] UI: Warn user if they are trying to load a model from
 no path (#6006)

---
 modules/ui_model_menu.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index fe0d3ee1..9a936b0a 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -273,6 +273,10 @@ def load_lora_wrapper(selected_loras):
 
 def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
     try:
+        if repo_id == "":
+            yield ("Please enter a model path")
+            return
+
         downloader = importlib.import_module("download-model").ModelDownloader()
 
         progress(0.0)

From 8456d133492b2f892ffbc7e55007fad950318825 Mon Sep 17 00:00:00 2001
From: Jari Van Melckebeke <vmjari@hotmail.com>
Date: Mon, 20 May 2024 01:09:37 +0200
Subject: [PATCH 04/64] [docs] small docker changes  (#5917)

---
 docs/09 - Docker.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/09 - Docker.md b/docs/09 - Docker.md
index 406abcbf..eec8fafd 100644
--- a/docs/09 - Docker.md	
+++ b/docs/09 - Docker.md	
@@ -19,7 +19,7 @@ Use these commands to launch the image:
 
 ```
 cd text-generation-webui
-ln -s docker/{nvidia/Dockerfile,docker-compose.yml,.dockerignore} .
+ln -s docker/{nvidia/Dockerfile,nvidia/docker-compose.yml,.dockerignore} .
 cp docker/.env.example .env
 # Edit .env and set TORCH_CUDA_ARCH_LIST based on your GPU model
 docker compose up --build

From 5cb59707f316102967ca05dc869801fbe510ff51 Mon Sep 17 00:00:00 2001
From: A0nameless0man <1395943920@qq.com>
Date: Mon, 20 May 2024 07:10:39 +0800
Subject: [PATCH 05/64] fix: grammar not support utf-8 (#5900)

---
 modules/grammar/grammar_utils.py | 51 +++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/modules/grammar/grammar_utils.py b/modules/grammar/grammar_utils.py
index 4b37c24a..140748d5 100644
--- a/modules/grammar/grammar_utils.py
+++ b/modules/grammar/grammar_utils.py
@@ -60,7 +60,7 @@ def hex_to_int(c):
         return int(c)
     elif "a" <= c.lower() <= "f":
         return ord(c.lower()) - ord("a") + 10
-    return -1
+    raise RuntimeError("unknown hex char " + c)
 
 
 def remove_leading_white_space(src, newline_ok):
@@ -100,6 +100,13 @@ def parse_name(src):
     return src[:pos], src[pos:]
 
 
+def read_hex(s):
+    val = 0
+    for c in s:
+        val = (val << 4) + hex_to_int(c)
+    return chr(val)
+
+
 def parse_char(src):
     """
     parse the leading char from the input string
@@ -111,13 +118,12 @@ def parse_char(src):
     if src[0] == "\\":
         esc = src[1]
         if esc == "x":
-            first = hex_to_int(src[2])
-            if first > -1:
-                second = hex_to_int(src[3])
-                if second > -1:
-                    return (first << 4) + second, src[4:]
-            raise RuntimeError("expecting \\xNN at " + src)
-        elif esc in ('"', "[", "]"):
+            return read_hex(src[2:4]), src[4:]
+        elif esc == "u":
+            return read_hex(src[2:6]), src[6:]
+        elif esc == "U":
+            return read_hex(src[2:10]), src[10:]
+        elif esc in ('"', "[", "]", "\\", "-"):
             return esc, src[2:]
         elif esc == "r":
             return "\r", src[2:]
@@ -454,7 +460,8 @@ class IncrementalGrammarConstraint(GrammarConstraint):
     def __init__(self, grammar_str, start_rule_name, tokenizer):
         super().__init__(grammar_str, start_rule_name, tokenizer)
 
-    def accept_char(self, byte, stacks):
+    def accept_char(self, char, stacks):
+        byte = ord(char)
         new_stacks = []
         for stack in stacks:
             # stack is empty
@@ -471,6 +478,9 @@ class IncrementalGrammarConstraint(GrammarConstraint):
                 if self.grammar_encoding[pos + i] <= byte and byte <= self.grammar_encoding[pos + i + 1]:
                     found = True
                     break
+                if self.grammar_encoding[pos + i] >= byte and byte >= self.grammar_encoding[pos + i + 1]:
+                    found = True
+                    break
             if not found:
                 continue
 
@@ -483,9 +493,8 @@ class IncrementalGrammarConstraint(GrammarConstraint):
         return new_stacks
 
     def accept_string(self, string: str, stacks: List[List[int]]):
-        _bytes = bytes(string, "utf-8")
-        for byte in _bytes:
-            stacks = self.accept_char(byte, stacks)
+        for char in string:
+            stacks = self.accept_char(char, stacks)
         return stacks
 
     def accept_token_id(self, token_id: int, stacks: List[List[int]]):
@@ -537,16 +546,18 @@ class IncrementalGrammarConstraint(GrammarConstraint):
 
     # For each sub-rule in the grammar, cache whether each byte is accepted.
     @lru_cache(maxsize=None)
-    def pos_char_acceptance(self, pos):
-        acceptance = [False] * 256
+    def pos_char_acceptance(self, pos, char):
+        byte = ord(char)
         num_chars = self.grammar_encoding[pos]
         pos += 1
         for i in range(0, num_chars, 2):
             start = self.grammar_encoding[pos + i]
             end = self.grammar_encoding[pos + i + 1]
-            for j in range(start, end + 1):
-                acceptance[j] = True
-        return acceptance
+            if byte >= start and byte <= end:
+                return True
+            if byte <= start and byte >= end:
+                return True
+        return False
 
     # Probably this should be configurable. If the grammar has an exceedingly
     # large number of states, the correct setting is a tradeoff between GPU
@@ -580,7 +591,7 @@ class IncrementalGrammarConstraint(GrammarConstraint):
                     pos = stk[-1]
                     num_chars = self.grammar_encoding[pos]
 
-                    if not self.pos_char_acceptance(pos)[byte]:
+                    if not self.pos_char_acceptance(pos, byte):
                         continue
 
                     pos += num_chars + 1
@@ -657,14 +668,14 @@ class TokenTrie:
                 token = tokenizer.convert_ids_to_tokens(id)
                 token = re.sub(r"<0x([0-9a-fA-F]{2})>", replace_hex, token)
                 token = token.replace("▁", " ")
-                return bytes(token, "utf-8")
+                return token
 
         else:
             print("Warning: unrecognized tokenizer: using default token formatting")
 
             def fmt_token(id):
                 token = tokenizer.convert_ids_to_tokens(id)
-                return bytes(token, "utf-8")
+                return token
 
         # note: vocab_size doesn't work here because there are also
         # get_added_vocab() tokens

From d7bd3da35ed6c10b6ebcd7fd913f1e39221c7650 Mon Sep 17 00:00:00 2001
From: Guanghua Lu <102669562+Touch-Night@users.noreply.github.com>
Date: Mon, 20 May 2024 07:17:26 +0800
Subject: [PATCH 06/64] Add Llama 3 instruction template (#5891)

---
 instruction-templates/Llama-v3.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 instruction-templates/Llama-v3.yaml

diff --git a/instruction-templates/Llama-v3.yaml b/instruction-templates/Llama-v3.yaml
new file mode 100644
index 00000000..ba96c7e2
--- /dev/null
+++ b/instruction-templates/Llama-v3.yaml
@@ -0,0 +1,13 @@
+instruction_template: |-
+  {%- set ns = namespace(found=false) -%}
+  {%- for message in messages -%}
+      {%- if message['role'] == 'system' -%}
+          {%- set ns.found = true -%}
+      {%- endif -%}
+  {%- endfor -%}
+  {%- for message in messages -%}
+      {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'].rstrip() + '<|eot_id|>' -}}
+  {%- endfor -%}
+  {%- if add_generation_prompt -%}
+      {{-'<|start_header_id|>assistant<|end_header_id|>\n\n'-}}
+  {%- endif -%}

From 907702c204c6a200ebbdec0f4b4471136662ccf5 Mon Sep 17 00:00:00 2001
From: Tisjwlf <156173182+Tisjwlf@users.noreply.github.com>
Date: Mon, 20 May 2024 01:22:09 +0200
Subject: [PATCH 07/64] Fix gguf multipart file loading (#5857)

---
 modules/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/models.py b/modules/models.py
index 687af8ba..cac66393 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -266,7 +266,7 @@ def llamacpp_loader(model_name):
     if path.is_file():
         model_file = path
     else:
-        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
+        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] 
 
     logger.info(f"llama.cpp weights detected: \"{model_file}\"")
     model, tokenizer = LlamaCppModel.from_pretrained(model_file)

From 818b4e0354bf77717d219c73f4e54158a3bb12a1 Mon Sep 17 00:00:00 2001
From: altoiddealer <altoiddealer@gmail.com>
Date: Sun, 19 May 2024 19:26:09 -0400
Subject: [PATCH 08/64] Let grammar escape backslashes (#5865)

---
 modules/grammar/grammar_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/grammar/grammar_utils.py b/modules/grammar/grammar_utils.py
index 140748d5..7f09ff82 100644
--- a/modules/grammar/grammar_utils.py
+++ b/modules/grammar/grammar_utils.py
@@ -131,6 +131,8 @@ def parse_char(src):
             return "\n", src[2:]
         elif esc == "t":
             return "\t", src[2:]
+        elif esc == "\\":
+            return "\\", src[2:]
         raise RuntimeError("unknown escape at " + src)
     elif src:
         return src[0], src[1:]

From 9f77ed1b987969b077d36438303183d3e2609f5a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 19 May 2024 23:29:39 -0300
Subject: [PATCH 09/64] --idle-timeout flag to unload the model if unused for N
 minutes (#6026)

---
 modules/chat.py            |  8 --------
 modules/logits.py          | 22 ++++++++++++++++++++--
 modules/models.py          | 22 ++++++++++++++++++++++
 modules/shared.py          |  2 ++
 modules/text_generation.py |  7 ++++++-
 server.py                  |  9 +++++++--
 6 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 4e0bde1c..43f5466b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -308,9 +308,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
                     'internal': output['internal']
                 }
 
-    if shared.model_name == 'None' or shared.model is None:
-        raise ValueError("No model is loaded! Select one in the Model tab.")
-
     # Generate the prompt
     kwargs = {
         '_continue': _continue,
@@ -355,11 +352,6 @@ def impersonate_wrapper(text, state):
 
     static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
-    if shared.model_name == 'None' or shared.model is None:
-        logger.error("No model is loaded! Select one in the Model tab.")
-        yield '', static_output
-        return
-
     prompt = generate_chat_prompt('', state, impersonate=True)
     stopping_strings = get_stopping_strings(state)
 
diff --git a/modules/logits.py b/modules/logits.py
index f2fd233b..3e793bd0 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -1,14 +1,32 @@
+import time
+
 import torch
 from transformers import is_torch_npu_available, is_torch_xpu_available
 
-from modules import sampler_hijack, shared
+from modules import models, sampler_hijack, shared
 from modules.logging_colors import logger
+from modules.models import load_model
 from modules.text_generation import generate_reply
 
 global_scores = None
 
 
-def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
+def get_next_logits(*args, **kwargs):
+    if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
+        shared.model, shared.tokenizer = load_model(shared.previous_model_name)
+
+    shared.generation_lock.acquire()
+    try:
+        result = _get_next_logits(*args, **kwargs)
+    except:
+        result = None
+
+    models.last_generation_time = time.time()
+    shared.generation_lock.release()
+    return result
+
+
+def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
     if shared.model is None:
         logger.error("No model is loaded! Select one in the Model tab.")
         return 'Error: No model is loaded1 Select one in the Model tab.', previous
diff --git a/modules/models.py b/modules/models.py
index cac66393..b03e1c9d 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -61,6 +61,9 @@ if shared.args.deepspeed:
 sampler_hijack.hijack_samplers()
 
 
+last_generation_time = time.time()
+
+
 def load_model(model_name, loader=None):
     logger.info(f"Loading \"{model_name}\"")
     t0 = time.time()
@@ -428,6 +431,7 @@ def clear_torch_cache():
 
 def unload_model():
     shared.model = shared.tokenizer = None
+    shared.previous_model_name = shared.model_name
     shared.model_name = 'None'
     shared.lora_names = []
     shared.model_dirty_from_training = False
@@ -437,3 +441,21 @@ def unload_model():
 def reload_model():
     unload_model()
     shared.model, shared.tokenizer = load_model(shared.model_name)
+
+
+def unload_model_if_idle():
+    global last_generation_time
+
+    logger.info(f"Setting a timeout of {shared.args.idle_timeout} minutes to unload the model in case of inactivity.")
+
+    while True:
+        shared.generation_lock.acquire()
+        try:
+            if time.time() - last_generation_time > shared.args.idle_timeout * 60:
+                if shared.model is not None:
+                    logger.info("Unloading the model for inactivity.")
+                    unload_model()
+        finally:
+            shared.generation_lock.release()
+
+        time.sleep(60)
diff --git a/modules/shared.py b/modules/shared.py
index a3ce584c..645ba701 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -13,6 +13,7 @@ from modules.logging_colors import logger
 model = None
 tokenizer = None
 model_name = 'None'
+previous_model_name = 'None'
 is_seq2seq = False
 model_dirty_from_training = False
 lora_names = []
@@ -84,6 +85,7 @@ group.add_argument('--settings', type=str, help='Load the default interface sett
 group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
 group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
 group.add_argument('--chat-buttons', action='store_true', help='Show buttons on the chat tab instead of a hover menu.')
+group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.')
 
 # Model loader
 group = parser.add_argument_group('Model loader')
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 79067f84..afcdaddb 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -16,6 +16,7 @@ from transformers import (
 )
 
 import modules.shared as shared
+from modules import models
 from modules.cache_utils import process_llamacpp_cache
 from modules.callbacks import (
     Iteratorize,
@@ -27,15 +28,19 @@ from modules.grammar.grammar_utils import initialize_grammar
 from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
 from modules.html_generator import generate_basic_html
 from modules.logging_colors import logger
-from modules.models import clear_torch_cache
+from modules.models import clear_torch_cache, load_model
 
 
 def generate_reply(*args, **kwargs):
+    if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
+        shared.model, shared.tokenizer = load_model(shared.previous_model_name)
+
     shared.generation_lock.acquire()
     try:
         for result in _generate_reply(*args, **kwargs):
             yield result
     finally:
+        models.last_generation_time = time.time()
         shared.generation_lock.release()
 
 
diff --git a/server.py b/server.py
index 4b5185be..04a5b16d 100644
--- a/server.py
+++ b/server.py
@@ -32,7 +32,7 @@ import sys
 import time
 from functools import partial
 from pathlib import Path
-from threading import Lock
+from threading import Lock, Thread
 
 import yaml
 
@@ -52,7 +52,7 @@ from modules import (
 )
 from modules.extensions import apply_extensions
 from modules.LoRA import add_lora_to_model
-from modules.models import load_model
+from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
     get_fallback_settings,
     get_model_metadata,
@@ -245,6 +245,11 @@ if __name__ == "__main__":
 
     shared.generation_lock = Lock()
 
+    if shared.args.idle_timeout > 0:
+        timer_thread = Thread(target=unload_model_if_idle)
+        timer_thread.daemon = True
+        timer_thread.start()
+
     if shared.args.nowebui:
         # Start the API in standalone mode
         shared.args.extensions = [x for x in shared.args.extensions if x != 'gallery']

From 852c9437690893bd953afacbb092eb929d139f32 Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Mon, 20 May 2024 08:23:47 +0530
Subject: [PATCH 10/64] DRY: A modern repetition penalty that reliably prevents
 looping (#5677)

---
 extensions/openai/typing.py |   4 ++
 modules/loaders.py          |  12 ++++
 modules/presets.py          |   4 ++
 modules/sampler_hijack.py   | 111 +++++++++++++++++++++++++++++++++---
 modules/text_generation.py  |   2 +-
 modules/ui.py               |   4 ++
 modules/ui_parameters.py    |   8 +++
 7 files changed, 137 insertions(+), 8 deletions(-)

diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 2b30ebf2..4015f6a1 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -33,6 +33,10 @@ class GenerationOptions(BaseModel):
     seed: int = -1
     encoder_repetition_penalty: float = 1
     no_repeat_ngram_size: int = 0
+    dry_multiplier: float = 0
+    dry_base: float = 1.75
+    dry_allowed_length: int = 2
+    dry_sequence_breakers: str = '"\\n", ":", "\\"", "*"'
     truncation_length: int = 0
     max_tokens_second: int = 0
     prompt_lookup_num_tokens: int = 0
diff --git a/modules/loaders.py b/modules/loaders.py
index e7ff2d21..fa7595bf 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -178,6 +178,10 @@ def transformers_samplers():
         'repetition_penalty_range',
         'encoder_repetition_penalty',
         'no_repeat_ngram_size',
+        'dry_multiplier',
+        'dry_base',
+        'dry_allowed_length',
+        'dry_sequence_breakers',
         'seed',
         'do_sample',
         'penalty_alpha',
@@ -251,6 +255,10 @@ loaders_samplers = {
         'repetition_penalty_range',
         'encoder_repetition_penalty',
         'no_repeat_ngram_size',
+        'dry_multiplier',
+        'dry_base',
+        'dry_allowed_length',
+        'dry_sequence_breakers',
         'seed',
         'do_sample',
         'mirostat_mode',
@@ -309,6 +317,10 @@ loaders_samplers = {
         'repetition_penalty_range',
         'encoder_repetition_penalty',
         'no_repeat_ngram_size',
+        'dry_multiplier',
+        'dry_base',
+        'dry_allowed_length',
+        'dry_sequence_breakers',
         'seed',
         'do_sample',
         'mirostat_mode',
diff --git a/modules/presets.py b/modules/presets.py
index a7cda31c..b00e829e 100644
--- a/modules/presets.py
+++ b/modules/presets.py
@@ -40,6 +40,10 @@ def default_preset():
         'do_sample': True,
         'encoder_repetition_penalty': 1,
         'no_repeat_ngram_size': 0,
+        'dry_multiplier': 0,
+        'dry_base': 1.75,
+        'dry_allowed_length': 2,
+        'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
         'sampler_priority': 'temperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat'
     }
 
diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index da52f4d0..1f9ca52c 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -1,3 +1,4 @@
+import json
 import math
 import pprint
 
@@ -220,6 +221,74 @@ class TopALogitsWarper(LogitsWarper):
         return scores
 
 
+class DRYLogitsProcessor(LogitsProcessor):
+    def __init__(self, multiplier: float, base: float, allowed_length: int, sequence_breakers: set[int], _range: int):
+        self.multiplier = multiplier
+        self.base = base
+        self.allowed_length = allowed_length
+        self.sequence_breakers = sequence_breakers
+        self._range = _range
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        if self._range > 0:
+            input_ids = input_ids[:, -self._range:]
+
+        for input_ids_row, scores_row in zip(input_ids, scores):
+            # Raw integer must be extracted here to check for set membership.
+            last_token = input_ids_row[-1].item()
+
+            if last_token in self.sequence_breakers:
+                continue
+
+            # Exclude the last token as it always matches.
+            match_indices = (input_ids_row[:-1] == last_token).nonzero()
+
+            # Stores the maximum matching sequence length
+            # for each token immediately following the sequence in the input.
+            match_lengths = {}
+
+            for i in match_indices:
+                next_token = input_ids_row[i+1].item()
+
+                if next_token in self.sequence_breakers:
+                    continue
+
+                # We have already found that `last_token` matches at this index,
+                # so the match is at least of length 1.
+                match_length = 1
+
+                # Extend the match backwards as far as possible.
+                while True:
+                    j = i - match_length
+                    if j < 0:
+                        # Start of input reached.
+                        break
+
+                    previous_token = input_ids_row[-(match_length+1)].item()
+                    if input_ids_row[j] != previous_token:
+                        # Start of match reached.
+                        break
+
+                    if previous_token in self.sequence_breakers:
+                        # Sequence-breaking token reached.
+                        break
+
+                    match_length += 1
+
+                if next_token in match_lengths:
+                    match_lengths[next_token] = max(match_length, match_lengths[next_token])
+                else:
+                    match_lengths[next_token] = match_length
+
+            # Apply penalties.
+            for token, match_length in match_lengths.items():
+                if match_length >= self.allowed_length:
+                    penalty = self.multiplier * self.base ** (match_length - self.allowed_length)
+                    scores_row[token] -= penalty
+
+        return scores
+
+
 class MirostatLogitsWarper(LogitsWarper):
     def __init__(self, mirostat_mode: int, mirostat_tau: float, mirostat_eta: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
         if mirostat_mode not in [2]:
@@ -448,20 +517,44 @@ def get_logits_warper_patch(self, generation_config):
 
 
 def get_logits_processor_patch(self, **kwargs):
-    repetition_penalty = kwargs['generation_config'].repetition_penalty
-    presence_penalty = kwargs['generation_config'].presence_penalty
-    frequency_penalty = kwargs['generation_config'].frequency_penalty
-    repetition_penalty_range = kwargs['generation_config'].repetition_penalty_range
-    do_rep_pen_hijack = (repetition_penalty > 1) or (presence_penalty != 0) or (frequency_penalty != 0)
+    generation_config = kwargs['generation_config']
+
+    do_rep_pen_hijack = (generation_config.repetition_penalty > 1) or (generation_config.presence_penalty != 0) or (generation_config.frequency_penalty != 0)
     if do_rep_pen_hijack:
-        kwargs['generation_config'].repetition_penalty = 1.1  # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created
+        generation_config.repetition_penalty = 1.1  # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created
 
     result = self._get_logits_processor_old(**kwargs)
 
     if do_rep_pen_hijack:
         for i in range(len(result)):
             if result[i].__class__.__name__ == 'RepetitionPenaltyLogitsProcessor':
-                result[i] = RepetitionPenaltyLogitsProcessorWithRange(repetition_penalty, presence_penalty, frequency_penalty, repetition_penalty_range)
+                result[i] = RepetitionPenaltyLogitsProcessorWithRange(
+                    generation_config.repetition_penalty,
+                    generation_config.presence_penalty,
+                    generation_config.frequency_penalty,
+                    generation_config.repetition_penalty_range
+                )
+
+    if generation_config.dry_multiplier is not None and generation_config.dry_multiplier > 0.0:
+        dry_sequence_breakers = generation_config.dry_sequence_breakers
+
+        # Support both JSON array notation and comma-separated strings.
+        if not dry_sequence_breakers.startswith("["):
+            dry_sequence_breakers = "[" + dry_sequence_breakers + "]"
+
+        sequence_breaker_strings = json.loads(dry_sequence_breakers)
+        # Prefix with 'a' to get the correct encoding of the token at the end of a text.
+        sequence_breakers = {shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings}
+
+        result.append(
+            DRYLogitsProcessor(
+                multiplier=generation_config.dry_multiplier,
+                base=generation_config.dry_base,
+                allowed_length=generation_config.dry_allowed_length,
+                sequence_breakers=sequence_breakers,
+                _range=generation_config.repetition_penalty_range,
+            )
+        )
 
     return result
 
@@ -483,6 +576,10 @@ def generation_config_init_patch(self, **kwargs):
     self.repetition_penalty_range = kwargs.pop("repetition_penalty_range", 0)
     self.presence_penalty = kwargs.pop("presence_penalty", 0)
     self.frequency_penalty = kwargs.pop("frequency_penalty", 0)
+    self.dry_multiplier = kwargs.pop("dry_multiplier", 0.0)
+    self.dry_base = kwargs.pop("dry_base", 1.75)
+    self.dry_allowed_length = kwargs.pop("dry_allowed_length", 2)
+    self.dry_sequence_breakers = kwargs.pop("dry_sequence_breakers", '"\\n", ":", "\\"", "*"')
     self.temperature_last = kwargs.pop("temperature_last", False)
     self.sampler_priority = kwargs.pop("sampler_priority", ['temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat'])
 
diff --git a/modules/text_generation.py b/modules/text_generation.py
index afcdaddb..75294013 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -272,7 +272,7 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
 
 def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
     generate_params = {}
-    for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size']:
+    for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers']:
         if k in state:
             generate_params[k] = state[k]
 
diff --git a/modules/ui.py b/modules/ui.py
index 37526168..992616de 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -147,6 +147,10 @@ def list_interface_input_elements():
         'repetition_penalty_range',
         'encoder_repetition_penalty',
         'no_repeat_ngram_size',
+        'dry_multiplier',
+        'dry_base',
+        'dry_allowed_length',
+        'dry_sequence_breakers',
         'do_sample',
         'penalty_alpha',
         'mirostat_mode',
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index 0414cdde..d62b74c1 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -38,6 +38,14 @@ def create_ui(default_preset):
                             shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
                             shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range')
                             shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
+
+                            with gr.Blocks():
+                                gr.Markdown("[DRY sequence repetition penalty](https://github.com/oobabooga/text-generation-webui/pull/5677)")
+                                shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=generate_params['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to value > 0 to enable DRY. Controls the magnitude of the penalty for the shortest penalized sequences.')
+                                shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.')
+                                shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=generate_params['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.')
+                                shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.')
+
                             gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab)")
 
                         with gr.Column():

From 6a1682aa952c4870faeab071e7bfc045f961ac45 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 19 May 2024 20:28:21 -0700
Subject: [PATCH 11/64] README: update command-line flags with raw --help
 output

This helps me keep this up-to-date more easily.
---
 README.md | 275 +++++++++++++++++++++++++-----------------------------
 1 file changed, 129 insertions(+), 146 deletions(-)

diff --git a/README.md b/README.md
index a3414387..3a58ca63 100644
--- a/README.md
+++ b/README.md
@@ -200,168 +200,151 @@ pip install -r <requirements file that you have used> --upgrade
 List of command-line flags
 </summary>
 
-#### Basic settings
+```txt
+usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
+                 [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--chat-buttons] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices]
+                 [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code]
+                 [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn]
+                 [--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS]
+                 [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] [--attention-sink-size ATTENTION_SINK_SIZE]
+                 [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--cache_8bit] [--cache_4bit] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN]
+                 [--triton] [--no_inject_fused_attention] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--model_type MODEL_TYPE]
+                 [--groupsize GROUPSIZE] [--pre_layer PRE_LAYER [PRE_LAYER ...]] [--checkpoint CHECKPOINT] [--monkey-patch] [--hqq-backend HQQ_BACKEND] [--deepspeed]
+                 [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
+                 [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
+                 [--ssl-certfile SSL_CERTFILE] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui]
+                 [--multimodal-pipeline MULTIMODAL_PIPELINE]
 
-| Flag                                       | Description |
-|--------------------------------------------|-------------|
-| `-h`, `--help`                             | show this help message and exit |
-| `--multi-user`                             | Multi-user mode. Chat histories are not saved or automatically loaded. WARNING: this is likely not safe for sharing publicly. |
-| `--character CHARACTER`                    | The name of the character to load in chat mode by default. |
-| `--model MODEL`                            | Name of the model to load by default. |
-| `--lora LORA [LORA ...]`                   | The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. |
-| `--model-dir MODEL_DIR`                    | Path to directory with all the models. |
-| `--lora-dir LORA_DIR`                      | Path to directory with all the loras. |
-| `--model-menu`                             | Show a model menu in the terminal when the web UI is first launched. |
-| `--settings SETTINGS_FILE`                 | Load the default interface settings from this yaml file. See `settings-template.yaml` for an example. If you create a file called `settings.yaml`, this file will be loaded by default without the need to use the `--settings` flag. |
-| `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. |
-| `--verbose`                                | Print the prompts to the terminal. |
-| `--chat-buttons`                           | Show buttons on the chat tab instead of a hover menu. |
+Text generation web UI
 
-#### Model loader
+options:
+  -h, --help                                     show this help message and exit
 
-| Flag                                       | Description |
-|--------------------------------------------|-------------|
-| `--loader LOADER`                          | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#. |
+Basic settings:
+  --multi-user                                   Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.
+  --character CHARACTER                          The name of the character to load in chat mode by default.
+  --model MODEL                                  Name of the model to load by default.
+  --lora LORA [LORA ...]                         The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.
+  --model-dir MODEL_DIR                          Path to directory with all the models.
+  --lora-dir LORA_DIR                            Path to directory with all the loras.
+  --model-menu                                   Show a model menu in the terminal when the web UI is first launched.
+  --settings SETTINGS                            Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this
+                                                 file will be loaded by default without the need to use the --settings flag.
+  --extensions EXTENSIONS [EXTENSIONS ...]       The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.
+  --verbose                                      Print the prompts to the terminal.
+  --chat-buttons                                 Show buttons on the chat tab instead of a hover menu.
+  --idle-timeout IDLE_TIMEOUT                    Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.
 
-#### Accelerate/transformers
+Model loader:
+  --loader LOADER                                Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2,
+                                                 AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#.
 
-| Flag                                        | Description |
-|---------------------------------------------|-------------|
-| `--cpu`                                     | Use the CPU to generate text. Warning: Training on CPU is extremely slow. |
-| `--auto-devices`                            | Automatically split the model across the available GPU(s) and CPU. |
-|  `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB. |
-| `--cpu-memory CPU_MEMORY`                   | Maximum CPU memory in GiB to allocate for offloaded weights. Same as above. |
-| `--disk`                                    | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
-| `--disk-cache-dir DISK_CACHE_DIR`           | Directory to save the disk cache to. Defaults to "cache". |
-| `--load-in-8bit`                            | Load the model with 8-bit precision (using bitsandbytes). |
-| `--bf16`                                    | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
-| `--no-cache`                                | Set `use_cache` to `False` while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. |
-| `--trust-remote-code`                       | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
-| `--no_use_fast`                             | Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. |
-| `--use_flash_attention_2`                   | Set use_flash_attention_2=True while loading the model. |
+Transformers/Accelerate:
+  --cpu                                          Use the CPU to generate text. Warning: Training on CPU is extremely slow.
+  --auto-devices                                 Automatically split the model across the available GPU(s) and CPU.
+  --gpu-memory GPU_MEMORY [GPU_MEMORY ...]       Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values
+                                                 in MiB like --gpu-memory 3500MiB.
+  --cpu-memory CPU_MEMORY                        Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.
+  --disk                                         If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.
+  --disk-cache-dir DISK_CACHE_DIR                Directory to save the disk cache to. Defaults to "cache".
+  --load-in-8bit                                 Load the model with 8-bit precision (using bitsandbytes).
+  --bf16                                         Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
+  --no-cache                                     Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.
+  --trust-remote-code                            Set trust_remote_code=True while loading the model. Necessary for some models.
+  --force-safetensors                            Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
+  --no_use_fast                                  Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
+  --use_flash_attention_2                        Set use_flash_attention_2=True while loading the model.
 
-#### bitsandbytes 4-bit
+bitsandbytes 4-bit:
+  --load-in-4bit                                 Load the model with 4-bit precision (using bitsandbytes).
+  --use_double_quant                             use_double_quant for 4-bit.
+  --compute_dtype COMPUTE_DTYPE                  compute dtype for 4-bit. Valid options: bfloat16, float16, float32.
+  --quant_type QUANT_TYPE                        quant_type for 4-bit. Valid options: nf4, fp4.
 
-⚠️  Requires minimum compute of 7.0 on Windows at the moment.
+llama.cpp:
+  --flash-attn                                   Use flash-attention.
+  --tensorcores                                  Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.
+  --n_ctx N_CTX                                  Size of the prompt context.
+  --threads THREADS                              Number of threads to use.
+  --threads-batch THREADS_BATCH                  Number of threads to use for batches/prompt processing.
+  --no_mul_mat_q                                 Disable the mulmat kernels.
+  --n_batch N_BATCH                              Maximum number of prompt tokens to batch together when calling llama_eval.
+  --no-mmap                                      Prevent mmap from being used.
+  --mlock                                        Force the system to keep the model in RAM.
+  --n-gpu-layers N_GPU_LAYERS                    Number of layers to offload to the GPU.
+  --tensor_split TENSOR_SPLIT                    Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.
+  --numa                                         Activate NUMA task allocation for llama.cpp.
+  --logits_all                                   Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.
+  --no_offload_kqv                               Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
+  --cache-capacity CACHE_CAPACITY                Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.
+  --row_split                                    Split the model by rows across GPUs. This may improve multi-gpu performance.
+  --streaming-llm                                Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
+  --attention-sink-size ATTENTION_SINK_SIZE      StreamingLLM: number of sink tokens. Only used if the trimmed prompt does not share a prefix with the old prompt.
 
-| Flag                                        | Description |
-|---------------------------------------------|-------------|
-| `--load-in-4bit`                            | Load the model with 4-bit precision (using bitsandbytes). |
-| `--use_double_quant`                        | use_double_quant for 4-bit. |
-| `--compute_dtype COMPUTE_DTYPE`             | compute dtype for 4-bit. Valid options: bfloat16, float16, float32. |
-| `--quant_type QUANT_TYPE`                   | quant_type for 4-bit. Valid options: nf4, fp4. |
+ExLlamaV2:
+  --gpu-split GPU_SPLIT                          Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.
+  --autosplit                                    Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.
+  --max_seq_len MAX_SEQ_LEN                      Maximum sequence length.
+  --cfg-cache                                    ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
+  --no_flash_attn                                Force flash-attention to not be used.
+  --cache_8bit                                   Use 8-bit cache to save VRAM.
+  --cache_4bit                                   Use Q4 cache to save VRAM.
+  --num_experts_per_token NUM_EXPERTS_PER_TOKEN  Number of experts to use for generation. Applies to MoE models like Mixtral.
 
-#### llama.cpp
+AutoGPTQ:
+  --triton                                       Use triton.
+  --no_inject_fused_attention                    Disable the use of fused attention, which will use less VRAM at the cost of slower inference.
+  --no_inject_fused_mlp                          Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.
+  --no_use_cuda_fp16                             This can make models faster on some systems.
+  --desc_act                                     For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.
+  --disable_exllama                              Disable ExLlama kernel, which can improve inference speed on some systems.
+  --disable_exllamav2                            Disable ExLlamav2 kernel.
 
-| Flag        | Description |
-|-------------|-------------|
-| `--tensorcores`  | Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only. |
-| `--flash-attn`   | Use flash-attention. |
-| `--n_ctx N_CTX` | Size of the prompt context. |
-| `--threads` | Number of threads to use. |
-| `--threads-batch THREADS_BATCH` | Number of threads to use for batches/prompt processing. |
-| `--no_mul_mat_q` | Disable the mulmat kernels. |
-| `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
-| `--no-mmap`   | Prevent mmap from being used. |
-| `--mlock`     | Force the system to keep the model in RAM. |
-| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. |
-| `--tensor_split TENSOR_SPLIT`       | Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17. |
-| `--numa`      | Activate NUMA task allocation for llama.cpp. |
-| `--logits_all`| Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower. |
-| `--no_offload_kqv` | Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. |
-| `--cache-capacity CACHE_CAPACITY`   | Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
-| `--row_split`                               | Split the model by rows across GPUs. This may improve multi-gpu performance. |
-| `--streaming-llm`                           | Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. |
-| `--attention-sink-size ATTENTION_SINK_SIZE` | StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn't share a prefix with the old prompt. |
+GPTQ-for-LLaMa:
+  --wbits WBITS                                  Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.
+  --model_type MODEL_TYPE                        Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.
+  --groupsize GROUPSIZE                          Group size.
+  --pre_layer PRE_LAYER [PRE_LAYER ...]          The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated
+                                                 by spaces, eg --pre_layer 30 60.
+  --checkpoint CHECKPOINT                        The path to the quantized checkpoint file. If not specified, it will be automatically detected.
+  --monkey-patch                                 Apply the monkey patch for using LoRAs with quantized models.
 
-#### ExLlamav2
+HQQ:
+  --hqq-backend HQQ_BACKEND                      Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
 
-| Flag             | Description |
-|------------------|-------------|
-|`--gpu-split`     | Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. |
-|`--max_seq_len MAX_SEQ_LEN`           | Maximum sequence length. |
-|`--cfg-cache`                         | ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. |
-|`--no_flash_attn`                     | Force flash-attention to not be used. |
-|`--cache_8bit`                        | Use 8-bit cache to save VRAM. |
-|`--cache_4bit`                        | Use Q4 cache to save VRAM. |
-|`--num_experts_per_token NUM_EXPERTS_PER_TOKEN` |  Number of experts to use for generation. Applies to MoE models like Mixtral. |
+DeepSpeed:
+  --deepspeed                                    Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
+  --nvme-offload-dir NVME_OFFLOAD_DIR            DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
+  --local_rank LOCAL_RANK                        DeepSpeed: Optional argument for distributed setups.
 
-#### AutoGPTQ
+RoPE:
+  --alpha_value ALPHA_VALUE                      Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
+  --rope_freq_base ROPE_FREQ_BASE                If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).
+  --compress_pos_emb COMPRESS_POS_EMB            Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.
 
-| Flag             | Description |
-|------------------|-------------|
-| `--triton`                     | Use triton. |
-| `--no_inject_fused_attention`  | Disable the use of fused attention, which will use less VRAM at the cost of slower inference. |
-| `--no_inject_fused_mlp`        | Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference. |
-| `--no_use_cuda_fp16`           | This can make models faster on some systems. |
-| `--desc_act`                   | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
-| `--disable_exllama`            | Disable ExLlama kernel, which can improve inference speed on some systems. |
-| `--disable_exllamav2`          | Disable ExLlamav2 kernel. |
+Gradio:
+  --listen                                       Make the web UI reachable from your local network.
+  --listen-port LISTEN_PORT                      The listening port that the server will use.
+  --listen-host LISTEN_HOST                      The hostname that the server will use.
+  --share                                        Create a public URL. This is useful for running the web UI on Google Colab or similar.
+  --auto-launch                                  Open the web UI in the default browser upon launch.
+  --gradio-auth GRADIO_AUTH                      Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3".
+  --gradio-auth-path GRADIO_AUTH_PATH            Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.
+  --ssl-keyfile SSL_KEYFILE                      The path to the SSL certificate key file.
+  --ssl-certfile SSL_CERTFILE                    The path to the SSL certificate cert file.
 
-#### GPTQ-for-LLaMa
+API:
+  --api                                          Enable the API extension.
+  --public-api                                   Create a public URL for the API using Cloudfare.
+  --public-api-id PUBLIC_API_ID                  Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.
+  --api-port API_PORT                            The listening port for the API.
+  --api-key API_KEY                              API authentication key.
+  --admin-key ADMIN_KEY                          API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.
+  --nowebui                                      Do not launch the Gradio UI. Useful for launching the API in standalone mode.
 
-| Flag                      | Description |
-|---------------------------|-------------|
-| `--wbits WBITS`           | Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
-| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
-| `--groupsize GROUPSIZE`   | Group size. |
-| `--pre_layer PRE_LAYER [PRE_LAYER ...]`  | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. |
-| `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. |
-| `--monkey-patch`          | Apply the monkey patch for using LoRAs with quantized models. |
-
-#### HQQ
-
-| Flag        | Description |
-|-------------|-------------|
-| `--hqq-backend` | Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. |
-
-#### DeepSpeed
-
-| Flag                                  | Description |
-|---------------------------------------|-------------|
-| `--deepspeed`                         | Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. |
-| `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. |
-| `--local_rank LOCAL_RANK`             | DeepSpeed: Optional argument for distributed setups. |
-
-#### RoPE (for llama.cpp, ExLlamaV2, and transformers)
-
-| Flag             | Description |
-|------------------|-------------|
-| `--alpha_value ALPHA_VALUE`           | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or `compress_pos_emb`, not both. |
-| `--rope_freq_base ROPE_FREQ_BASE`     | If greater than 0, will be used instead of alpha_value. Those two are related by `rope_freq_base = 10000 * alpha_value ^ (64 / 63)`. |
-| `--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should be set to `(context length) / (model's original context length)`. Equal to `1/rope_freq_scale`. |
-
-#### Gradio
-
-| Flag                                  | Description |
-|---------------------------------------|-------------|
-| `--listen`                            | Make the web UI reachable from your local network. |
-| `--listen-port LISTEN_PORT`           | The listening port that the server will use. |
-| `--listen-host LISTEN_HOST`           | The hostname that the server will use. |
-| `--share`                             | Create a public URL. This is useful for running the web UI on Google Colab or similar. |
-| `--auto-launch`                       | Open the web UI in the default browser upon launch. |
-| `--gradio-auth USER:PWD`              | Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". |
-| `--gradio-auth-path GRADIO_AUTH_PATH` | Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. |
-| `--ssl-keyfile SSL_KEYFILE`           | The path to the SSL certificate key file. |
-| `--ssl-certfile SSL_CERTFILE`         | The path to the SSL certificate cert file. |
-
-#### API
-
-| Flag                                  | Description |
-|---------------------------------------|-------------|
-| `--api`                               | Enable the API extension. |
-| `--public-api`                        | Create a public URL for the API using Cloudfare. |
-| `--public-api-id PUBLIC_API_ID`       | Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. |
-| `--api-port API_PORT`                 | The listening port for the API. |
-| `--api-key API_KEY`                   | API authentication key. |
-| `--admin-key ADMIN_KEY`               | API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. |
-| `--nowebui`                           | Do not launch the Gradio UI. Useful for launching the API in standalone mode. |
-
-#### Multimodal
-
-| Flag                                  | Description |
-|---------------------------------------|-------------|
-| `--multimodal-pipeline PIPELINE`      | The multimodal pipeline to use. Examples: `llava-7b`, `llava-13b`. |
+Multimodal:
+  --multimodal-pipeline MULTIMODAL_PIPELINE      The multimodal pipeline to use. Examples: llava-7b, llava-13b.
+```
 
 </details>
 

From bd7cc4234d0d2cc890c5e023f67741615c44484a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 21 May 2024 13:32:02 -0300
Subject: [PATCH 12/64] Backend cleanup (#6025)

---
 README.md                         |  27 ++---
 docs/04 - Model Tab.md            |   8 --
 docs/08 - Additional Tips.md      |  22 ----
 docs/What Works.md                |   8 +-
 modules/AutoGPTQ_loader.py        |   2 +-
 modules/GPTQ_loader.py            | 171 ------------------------------
 modules/loaders.py                |  34 ------
 modules/models.py                 |  55 +---------
 modules/models_settings.py        |   9 +-
 modules/monkey_patch_gptq_lora.py |  39 -------
 modules/shared.py                 |  24 ++---
 modules/training.py               |  19 ----
 modules/ui_model_menu.py          |   1 -
 one_click.py                      |   2 +-
 requirements.txt                  |  17 +--
 requirements_amd.txt              |  12 +--
 requirements_amd_noavx2.txt       |  12 +--
 requirements_apple_intel.txt      |   4 +-
 requirements_apple_silicon.txt    |   4 +-
 requirements_cpu_only.txt         |   4 +-
 requirements_cpu_only_noavx2.txt  |   4 +-
 requirements_noavx2.txt           |  17 +--
 requirements_nowheels.txt         |   4 +-
 23 files changed, 57 insertions(+), 442 deletions(-)
 delete mode 100644 modules/GPTQ_loader.py
 delete mode 100644 modules/monkey_patch_gptq_lora.py

diff --git a/README.md b/README.md
index 3a58ca63..a699a9ab 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Features
 
 * 3 interface modes: default (two columns), notebook, and chat.
-* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp).
+* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
 * Dropdown menu for quickly switching between different models.
 * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character).
@@ -208,12 +208,12 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--
                  [--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS]
                  [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] [--attention-sink-size ATTENTION_SINK_SIZE]
                  [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--cache_8bit] [--cache_4bit] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN]
-                 [--triton] [--no_inject_fused_attention] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--model_type MODEL_TYPE]
-                 [--groupsize GROUPSIZE] [--pre_layer PRE_LAYER [PRE_LAYER ...]] [--checkpoint CHECKPOINT] [--monkey-patch] [--hqq-backend HQQ_BACKEND] [--deepspeed]
-                 [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
-                 [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
-                 [--ssl-certfile SSL_CERTFILE] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui]
-                 [--multimodal-pipeline MULTIMODAL_PIPELINE]
+                 [--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE] [--no_inject_fused_attention]
+                 [--hqq-backend HQQ_BACKEND] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
+                 [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
+                 [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT]
+                 [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui] [--multimodal-pipeline MULTIMODAL_PIPELINE] [--model_type MODEL_TYPE] [--pre_layer PRE_LAYER [PRE_LAYER ...]]
+                 [--checkpoint CHECKPOINT] [--monkey-patch]
 
 Text generation web UI
 
@@ -237,7 +237,7 @@ Basic settings:
 
 Model loader:
   --loader LOADER                                Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2,
-                                                 AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#.
+                                                 AutoGPTQ, AutoAWQ.
 
 Transformers/Accelerate:
   --cpu                                          Use the CPU to generate text. Warning: Training on CPU is extremely slow.
@@ -293,21 +293,16 @@ ExLlamaV2:
 
 AutoGPTQ:
   --triton                                       Use triton.
-  --no_inject_fused_attention                    Disable the use of fused attention, which will use less VRAM at the cost of slower inference.
   --no_inject_fused_mlp                          Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.
   --no_use_cuda_fp16                             This can make models faster on some systems.
   --desc_act                                     For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.
   --disable_exllama                              Disable ExLlama kernel, which can improve inference speed on some systems.
   --disable_exllamav2                            Disable ExLlamav2 kernel.
-
-GPTQ-for-LLaMa:
   --wbits WBITS                                  Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.
-  --model_type MODEL_TYPE                        Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.
   --groupsize GROUPSIZE                          Group size.
-  --pre_layer PRE_LAYER [PRE_LAYER ...]          The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated
-                                                 by spaces, eg --pre_layer 30 60.
-  --checkpoint CHECKPOINT                        The path to the quantized checkpoint file. If not specified, it will be automatically detected.
-  --monkey-patch                                 Apply the monkey patch for using LoRAs with quantized models.
+
+AutoAWQ:
+  --no_inject_fused_attention                    Disable the use of fused attention, which will use less VRAM at the cost of slower inference.
 
 HQQ:
   --hqq-backend HQQ_BACKEND                      Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md
index 7c168e89..f44eb964 100644
--- a/docs/04 - Model Tab.md	
+++ b/docs/04 - Model Tab.md	
@@ -64,14 +64,6 @@ Loads: GPTQ models.
 * **no_use_cuda_fp16**: On some systems, the performance can be very bad with this unset. Can usually be ignored.
 * **desc_act**: For ancient models without proper metadata, sets the model "act-order" parameter manually. Can usually be ignored.
 
-### GPTQ-for-LLaMa
-
-Loads: GPTQ models.
-
-Ancient loader, the first one to implement 4-bit quantization. It works on older GPUs for which ExLlamaV2 and AutoGPTQ do not work, and it doesn't work with "act-order", so you should use it with simple 4-bit-128g models.
-
-* **pre_layer**: Used for CPU offloading. The higher the number, the more layers will be sent to the GPU. GPTQ-for-LLaMa CPU offloading was faster than the one implemented in AutoGPTQ the last time I checked.
-
 ### llama.cpp
 
 Loads: GGUF models. Note: GGML models have been deprecated and do not work anymore.
diff --git a/docs/08 - Additional Tips.md b/docs/08 - Additional Tips.md
index f48fa862..079d1da0 100644
--- a/docs/08 - Additional Tips.md	
+++ b/docs/08 - Additional Tips.md	
@@ -13,28 +13,6 @@ Source: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/1126
 
 This file will be automatically detected the next time you start the web UI.
 
-## Using LoRAs with GPTQ-for-LLaMa
-
-This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit
-
-To use it:
-
-Install alpaca_lora_4bit using pip
-
-```
-git clone https://github.com/johnsmith0031/alpaca_lora_4bit.git
-cd alpaca_lora_4bit
-git fetch origin winglian-setup_pip
-git checkout winglian-setup_pip
-pip install .
-```
-
-Start the UI with the --monkey-patch flag:
-
-```
-python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch
-```
-
 ## DeepSpeed
 
 `DeepSpeed ZeRO-3` is an alternative offloading strategy for full-precision (16-bit) transformers models.
diff --git a/docs/What Works.md b/docs/What Works.md
index 6c0d4c84..80abdc7f 100644
--- a/docs/What Works.md	
+++ b/docs/What Works.md	
@@ -2,15 +2,13 @@
 
 | Loader         | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation |
 |----------------|----------------|-------------------------|----------------|----------------------|-----------------------|
-| Transformers   |       ✅       |           ✅\*\*\*      |       ✅\*     |          ✅          |           ✅          |
+| Transformers   |       ✅       |           ✅\*\*        |       ✅\*     |          ✅          |           ✅          |
 | llama.cpp      |       ❌       |           ❌            |       ❌       |          ❌          |    use llamacpp_HF    |
 | llamacpp_HF    |       ❌       |           ❌            |       ❌       |          ❌          |           ✅          |
 | ExLlamav2_HF   |       ✅       |           ✅            |       ❌       |          ❌          |           ✅          |
 | ExLlamav2      |       ✅       |           ✅            |       ❌       |          ❌          |   use ExLlamav2_HF    |
 | AutoGPTQ       |       ✅       |           ❌            |       ❌       |          ✅          |           ✅          |
 | AutoAWQ        |       ?        |           ❌            |       ?        |          ?           |           ✅          |
-| GPTQ-for-LLaMa |       ✅\*\*   |           ✅\*\*\*      |       ✅       |          ✅          |           ✅          |
-| QuIP#          |       ?        |           ?             |       ?        |          ?           |           ✅          |
 | HQQ            |       ?        |           ?             |       ?        |          ?           |           ✅          |
 
 ❌ = not implemented
@@ -19,6 +17,4 @@
 
 \* Training LoRAs with GPTQ models also works with the Transformers loader. Make sure to check "auto-devices" and "disable_exllama" before loading the model.
 
-\*\* Requires the monkey-patch. The instructions can be found [here](https://github.com/oobabooga/text-generation-webui/wiki/08-%E2%80%90-Additional-Tips#using-loras-with-gptq-for-llama).
-
-\*\*\* Multi-LoRA in PEFT is tricky and the current implementation does not work reliably in all cases.
+\*\* Multi-LoRA in PEFT is tricky and the current implementation does not work reliably in all cases.
diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py
index 514a6ee5..69e8f299 100644
--- a/modules/AutoGPTQ_loader.py
+++ b/modules/AutoGPTQ_loader.py
@@ -44,7 +44,7 @@ def load_quantized(model_name):
         'model_basename': pt_path.stem,
         'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu",
         'use_triton': shared.args.triton,
-        'inject_fused_attention': not shared.args.no_inject_fused_attention,
+        'inject_fused_attention': False,
         'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
         'use_safetensors': use_safetensors,
         'trust_remote_code': shared.args.trust_remote_code,
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
deleted file mode 100644
index 601c58f3..00000000
--- a/modules/GPTQ_loader.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import inspect
-import re
-from pathlib import Path
-
-import accelerate
-import torch
-import transformers
-from accelerate.utils import is_xpu_available
-from gptq_for_llama import llama_inference_offload
-from gptq_for_llama.modelutils import find_layers
-from gptq_for_llama.quant import make_quant
-from transformers import AutoConfig, AutoModelForCausalLM
-
-import modules.shared as shared
-from modules.logging_colors import logger
-
-
-# This function is a replacement for the load_quant function in the
-# GPTQ-for_LLaMa repository. It supports more models and branches.
-def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=None, kernel_switch_threshold=128, eval=True):
-    exclude_layers = exclude_layers or ['lm_head']
-
-    def noop(*args, **kwargs):
-        pass
-
-    config = AutoConfig.from_pretrained(model, trust_remote_code=shared.args.trust_remote_code)
-    torch.nn.init.kaiming_uniform_ = noop
-    torch.nn.init.uniform_ = noop
-    torch.nn.init.normal_ = noop
-
-    torch.set_default_dtype(torch.half)
-    transformers.modeling_utils._init_weights = False
-    torch.set_default_dtype(torch.half)
-    model = AutoModelForCausalLM.from_config(config, trust_remote_code=shared.args.trust_remote_code)
-    torch.set_default_dtype(torch.float)
-    if eval:
-        model = model.eval()
-
-    layers = find_layers(model)
-    for name in exclude_layers:
-        if name in layers:
-            del layers[name]
-
-    gptq_args = inspect.getfullargspec(make_quant).args
-
-    make_quant_kwargs = {
-        'module': model,
-        'names': layers,
-        'bits': wbits,
-    }
-    if 'groupsize' in gptq_args:
-        make_quant_kwargs['groupsize'] = groupsize
-    if 'faster' in gptq_args:
-        make_quant_kwargs['faster'] = faster_kernel
-    if 'kernel_switch_threshold' in gptq_args:
-        make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
-
-    make_quant(**make_quant_kwargs)
-
-    del layers
-    if checkpoint.endswith('.safetensors'):
-        from safetensors.torch import load_file as safe_load
-        model.load_state_dict(safe_load(checkpoint), strict=False)
-    else:
-        model.load_state_dict(torch.load(checkpoint, weights_only=True), strict=False)
-
-    model.seqlen = 2048
-    return model
-
-
-# Used to locate the .pt/.safetensors quantized file
-def find_quantized_model_file(model_name):
-    if shared.args.checkpoint:
-        return Path(shared.args.checkpoint)
-
-    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
-    pt_path = None
-    priority_name_list = [
-        Path(f'{shared.args.model_dir}/{model_name}{hyphen}{shared.args.wbits}bit{group}{ext}')
-        for group in ([f'-{shared.args.groupsize}g', ''] if shared.args.groupsize > 0 else [''])
-        for ext in ['.safetensors', '.pt']
-        for hyphen in ['-', f'/{model_name}-', '/']
-    ]
-
-    for path in priority_name_list:
-        if path.exists():
-            pt_path = path
-            break
-
-    # If the model hasn't been found with a well-behaved name, pick the last .pt
-    # or the last .safetensors found in its folder as a last resort
-    if not pt_path:
-        for ext in ['.pt', '.safetensors']:
-            found = list(path_to_model.glob(f"*{ext}"))
-            if len(found) > 0:
-                if len(found) > 1:
-                    logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
-
-                pt_path = found[-1]
-                break
-
-    return pt_path
-
-
-# The function that loads the model in modules/models.py
-def load_quantized(model_name):
-    if shared.args.model_type is None:
-        logger.error("The model could not be loaded because its type could not be inferred from its name.")
-        logger.error("Please specify the type manually using the --model_type argument.")
-        return None
-
-    # Select the appropriate load_quant function
-    model_type = shared.args.model_type.lower()
-    if shared.args.pre_layer and model_type == 'llama':
-        load_quant = llama_inference_offload.load_quant
-    elif model_type in ('llama', 'opt', 'gptj'):
-        if shared.args.pre_layer:
-            logger.warning("Ignoring --pre_layer because it only works for llama model type.")
-
-        load_quant = _load_quant
-    else:
-        logger.error("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
-        exit()
-
-    # Find the quantized model weights file (.pt/.safetensors)
-    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
-    pt_path = find_quantized_model_file(model_name)
-    if not pt_path:
-        logger.error("Could not find the quantized model in .pt or .safetensors format. Exiting.")
-        exit()
-    else:
-        logger.info(f"Found the following quantized model: {pt_path}")
-
-    # qwopqwop200's offload
-    if model_type == 'llama' and shared.args.pre_layer:
-        if len(shared.args.pre_layer) == 1:
-            pre_layer = shared.args.pre_layer[0]
-        else:
-            pre_layer = shared.args.pre_layer
-
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, pre_layer)
-    else:
-        threshold = False if model_type == 'gptj' else 128
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)
-
-        # accelerate offload (doesn't work properly)
-        if shared.args.gpu_memory or torch.cuda.device_count() > 1 or (is_xpu_available() and torch.xpu.device_count() > 1):
-            if shared.args.gpu_memory:
-                memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
-                max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
-                max_memory = {}
-                for i in range(len(memory_map)):
-                    max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
-
-                max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
-            else:
-                max_memory = accelerate.utils.get_balanced_memory(model)
-
-            device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
-            logger.info("Using the following device map for the quantized model:", device_map)
-            # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
-            model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)
-
-        # No offload
-        elif not shared.args.cpu:
-            if is_xpu_available():
-                model = model.to(torch.device("xpu:0"))
-            else:
-                model = model.to(torch.device('cuda:0'))
-
-    return model
diff --git a/modules/loaders.py b/modules/loaders.py
index fa7595bf..cd9d0f88 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -105,7 +105,6 @@ loaders_and_params = OrderedDict({
     ],
     'AutoGPTQ': [
         'triton',
-        'no_inject_fused_attention',
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'wbits',
@@ -131,21 +130,6 @@ loaders_and_params = OrderedDict({
         'trust_remote_code',
         'no_use_fast',
     ],
-    'GPTQ-for-LLaMa': [
-        'wbits',
-        'groupsize',
-        'model_type',
-        'pre_layer',
-        'trust_remote_code',
-        'no_use_fast',
-        'gptq_for_llama_info',
-    ],
-    'QuIP#': [
-        'trust_remote_code',
-        'no_use_fast',
-        'no_flash_attn',
-        'quipsharp_info',
-    ],
     'HQQ': [
         'hqq_backend',
         'trust_remote_code',
@@ -205,9 +189,7 @@ def transformers_samplers():
 loaders_samplers = {
     'Transformers': transformers_samplers(),
     'AutoGPTQ': transformers_samplers(),
-    'GPTQ-for-LLaMa': transformers_samplers(),
     'AutoAWQ': transformers_samplers(),
-    'QuIP#': transformers_samplers(),
     'HQQ': transformers_samplers(),
     'ExLlamav2': {
         'temperature',
@@ -339,15 +321,6 @@ loaders_samplers = {
     },
 }
 
-loaders_model_types = {
-    'GPTQ-for-LLaMa': [
-        "None",
-        "llama",
-        "opt",
-        "gptj"
-    ],
-}
-
 
 @functools.cache
 def list_all_samplers():
@@ -375,13 +348,6 @@ def blacklist_samplers(loader, dynamic_temperature):
     return output
 
 
-def get_model_types(loader):
-    if loader in loaders_model_types:
-        return loaders_model_types[loader]
-
-    return ["None"]
-
-
 def get_gpu_memory_keys():
     return [k for k in shared.gradio if k.startswith('gpu_memory')]
 
diff --git a/modules/models.py b/modules/models.py
index b03e1c9d..b1d0b1cb 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -73,13 +73,11 @@ def load_model(model_name, loader=None):
     load_func_map = {
         'Transformers': huggingface_loader,
         'AutoGPTQ': AutoGPTQ_loader,
-        'GPTQ-for-LLaMa': GPTQ_loader,
         'llama.cpp': llamacpp_loader,
         'llamacpp_HF': llamacpp_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'AutoAWQ': AutoAWQ_loader,
-        'QuIP#': QuipSharp_loader,
         'HQQ': HQQ_loader,
     }
 
@@ -310,55 +308,6 @@ def AutoAWQ_loader(model_name):
     return model
 
 
-def QuipSharp_loader(model_name):
-    try:
-        with RelativeImport("repositories/quip-sharp"):
-            from lib.utils.unsafe_import import model_from_hf_path
-    except:
-        logger.error(
-            "\nQuIP# has not been found. It must be installed manually for now.\n"
-            "For instructions on how to do that, please consult:\n"
-            "https://github.com/oobabooga/text-generation-webui/pull/4803\n"
-        )
-        return None, None
-
-    # This fixes duplicate logging messages after the import above.
-    handlers = logging.getLogger().handlers
-    if len(handlers) > 1:
-        logging.getLogger().removeHandler(handlers[1])
-
-    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-    if not all((model_dir / file).exists() for file in ['tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.model']):
-        logger.error(f"Could not load the model because the tokenizer files could not be found in the model folder. Please download the following files from the original (unquantized) model into {model_dir}: special_tokens_map.json, tokenizer.json, tokenizer.model, tokenizer_config.json.")
-        return None, None
-
-    model, model_str = model_from_hf_path(
-        model_dir,
-        use_cuda_graph=False,
-        use_flash_attn=not shared.args.no_flash_attn
-    )
-
-    return model
-
-
-def GPTQ_loader(model_name):
-
-    # Monkey patch
-    if shared.args.monkey_patch:
-        logger.warning("Applying the monkey patch for using LoRAs with GPTQ models. It may cause undefined behavior outside its intended scope.")
-        from modules.monkey_patch_gptq_lora import load_model_llama
-
-        model, _ = load_model_llama(model_name)
-
-    # No monkey patch
-    else:
-        import modules.GPTQ_loader
-
-        model = modules.GPTQ_loader.load_quantized(model_name)
-
-    return model
-
-
 def AutoGPTQ_loader(model_name):
     import modules.AutoGPTQ_loader
 
@@ -380,12 +329,12 @@ def ExLlamav2_HF_loader(model_name):
 
 def HQQ_loader(model_name):
     from hqq.core.quantize import HQQBackend, HQQLinear
-    from hqq.engine.hf import HQQModelForCausalLM
+    from hqq.models.hf.base import AutoHQQHFModel
 
     logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"")
 
     model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-    model = HQQModelForCausalLM.from_quantized(str(model_dir))
+    model = AutoHQQHFModel.from_quantized(str(model_dir))
     HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend))
     return model
 
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 8576a16a..2ecd8a58 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -40,12 +40,7 @@ def get_model_metadata(model):
         hf_metadata = None
 
     if 'loader' not in model_settings:
-        if hf_metadata is not None and 'quip_params' in hf_metadata:
-            loader = 'QuIP#'
-        else:
-            loader = infer_loader(model, model_settings)
-
-        model_settings['loader'] = loader
+        model_settings['loader'] = infer_loader(model, model_settings)
 
     # GGUF metadata
     if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF']:
@@ -242,7 +237,7 @@ def apply_model_settings_to_state(model, state):
         loader = model_settings.pop('loader')
 
         # If the user is using an alternative loader for the same model type, let them keep using it
-        if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']):
+        if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']):
             state['loader'] = loader
 
     for k in model_settings:
diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py
deleted file mode 100644
index 3166bd33..00000000
--- a/modules/monkey_patch_gptq_lora.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit
-
-from pathlib import Path
-
-import alpaca_lora_4bit.autograd_4bit as autograd_4bit
-from alpaca_lora_4bit.amp_wrapper import AMPWrapper
-from alpaca_lora_4bit.autograd_4bit import (
-    Autograd4bitQuantLinear,
-    load_llama_model_4bit_low_ram
-)
-from alpaca_lora_4bit.models import Linear4bitLt
-from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
-    replace_peft_model_with_int4_lora_model
-)
-
-from modules import shared
-from modules.GPTQ_loader import find_quantized_model_file
-
-replace_peft_model_with_int4_lora_model()
-
-
-def load_model_llama(model_name):
-    config_path = str(Path(f'{shared.args.model_dir}/{model_name}'))
-    model_path = str(find_quantized_model_file(model_name))
-    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False)
-    for _, m in model.named_modules():
-        if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
-            if m.is_v1_model:
-                m.zeros = m.zeros.half()
-            m.scales = m.scales.half()
-            m.bias = m.bias.half()
-
-    autograd_4bit.auto_switch = True
-
-    model.half()
-    wrapper = AMPWrapper(model)
-    wrapper.apply_generate()
-
-    return model, tokenizer
diff --git a/modules/shared.py b/modules/shared.py
index 645ba701..373089dc 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -89,7 +89,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
 
 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -149,21 +149,17 @@ group.add_argument('--num_experts_per_token', type=int, default=2, help='Number
 # AutoGPTQ
 group = parser.add_argument_group('AutoGPTQ')
 group.add_argument('--triton', action='store_true', help='Use triton.')
-group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
 group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.')
 group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
 group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
 group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
 group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
-
-# GPTQ-for-LLaMa
-group = parser.add_argument_group('GPTQ-for-LLaMa')
 group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
-group.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
 group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
-group.add_argument('--pre_layer', type=int, nargs='+', help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.')
-group.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
-group.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')
+
+# AutoAWQ
+group = parser.add_argument_group('AutoAWQ')
+group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
 
 # HQQ
 group = parser.add_argument_group('HQQ')
@@ -208,7 +204,11 @@ group = parser.add_argument_group('Multimodal')
 group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
 
 # Deprecated parameters
-# group = parser.add_argument_group('Deprecated')
+group = parser.add_argument_group('Deprecated')
+group.add_argument('--model_type', type=str, help='DEPRECATED')
+group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
+group.add_argument('--checkpoint', type=str, help='DEPRECATED')
+group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -253,8 +253,6 @@ def fix_loader_name(name):
         return 'Transformers'
     elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
         return 'AutoGPTQ'
-    elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']:
-        return 'GPTQ-for-LLaMa'
     elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
         return 'ExLlama'
     elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
@@ -263,8 +261,6 @@ def fix_loader_name(name):
         return 'ExLlamav2_HF'
     elif name in ['autoawq', 'awq', 'auto-awq']:
         return 'AutoAWQ'
-    elif name in ['quip#', 'quip-sharp', 'quipsharp', 'quip_sharp']:
-        return 'QuIP#'
     elif name in ['hqq']:
         return 'HQQ'
 
diff --git a/modules/training.py b/modules/training.py
index dd360e7d..a810fb6e 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -292,12 +292,6 @@ def calc_trainable_parameters(model):
 
 def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
 
-    if shared.args.monkey_patch:
-        from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
-            replace_peft_model_with_int4_lora_model
-        )
-        replace_peft_model_with_int4_lora_model()
-
     global WANT_INTERRUPT
     WANT_INTERRUPT = False
 
@@ -329,10 +323,6 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
 
         time.sleep(5)
 
-    if shared.args.loader == 'GPTQ-for-LLaMa' and not shared.args.monkey_patch:
-        yield "LoRA training with GPTQ-for-LLaMa requires loading with `--monkey-patch`"
-        return
-
     if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
         yield "Cannot input zeroes."
         return
@@ -553,15 +543,6 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
         yield traceback.format_exc().replace('\n', '\n\n')
         return
 
-    if shared.args.monkey_patch:
-        from alpaca_lora_4bit.autograd_4bit import Autograd4bitQuantLinear
-        from alpaca_lora_4bit.models import Linear4bitLt
-        for _, m in lora_model.named_modules():
-            if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
-                if m.is_v1_model:
-                    m.zeros = m.zeros.half()
-                m.scales = m.scales.half()
-
     class Tracked():
         def __init__(self):
             self.current_steps = 0
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 9a936b0a..d7b4eabb 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -111,7 +111,6 @@ def create_ui():
                                 shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
 
                             shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
-                            shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.')
 
                         with gr.Column():
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
diff --git a/one_click.py b/one_click.py
index 0d543e30..55a4a3db 100644
--- a/one_click.py
+++ b/one_click.py
@@ -388,7 +388,7 @@ def update_requirements(initial_installation=False, pull=True):
     # Prepare the requirements file
     textgen_requirements = open(requirements_file).read().splitlines()
     if is_cuda118:
-        textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements]
+        textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements if "auto-gptq" not in req]
     if is_windows() and is_cuda118:  # No flash-attention on Windows for CUDA 11
         textgen_requirements = [req for req in textgen_requirements if 'oobabooga/flash-attention' not in req]
 
diff --git a/requirements.txt b/requirements.txt
index 65b14570..c1c63504 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,12 @@
 accelerate==0.30.*
-aqlm[gpu,cpu]==1.1.3; platform_system == "Linux"
+aqlm[gpu,cpu]==1.1.5; platform_system == "Linux"
+auto-gptq==0.7.1
 bitsandbytes==0.43.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -23,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
@@ -52,10 +53,6 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
@@ -65,8 +62,4 @@ https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows"
+autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 5231460e..8489a97c 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
@@ -40,12 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 93ac6288..68a82f40 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
@@ -38,12 +38,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 07726ec5..d3cecd08 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index bab40944..c940cc52 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index a102067b..77420e6f 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 24d3633a..56bdf547 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index aea0607b..c110be62 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,11 +1,12 @@
 accelerate==0.30.*
-aqlm[gpu,cpu]==1.1.3; platform_system == "Linux"
+aqlm[gpu,cpu]==1.1.5; platform_system == "Linux"
+auto-gptq==0.7.1
 bitsandbytes==0.43.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -23,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 
@@ -52,10 +53,6 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
@@ -65,8 +62,4 @@ https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows"
+autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index 135602fe..821049bc 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.5
+hqq==0.1.7.post2
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.41.*
 tqdm
 wandb
 

From ae8629215975852c7d33445f32865c746b056ec7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 21 May 2024 10:35:00 -0700
Subject: [PATCH 13/64] Fix getting Phi-3-small-128k-instruct logits

---
 modules/logits.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/modules/logits.py b/modules/logits.py
index 3e793bd0..1fe2e73e 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -1,4 +1,5 @@
 import time
+import traceback
 
 import torch
 from transformers import is_torch_npu_available, is_torch_xpu_available
@@ -18,7 +19,8 @@ def get_next_logits(*args, **kwargs):
     shared.generation_lock.acquire()
     try:
         result = _get_next_logits(*args, **kwargs)
-    except:
+    except Exception:
+        traceback.print_exc()
         result = None
 
     models.last_generation_time = time.time()
@@ -84,7 +86,14 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
         topk_values = [float(i) for i in topk_values]
         output = {}
         for row in list(zip(topk_values, tokens)):
-            output[row[1]] = row[0]
+            key = row[1]
+            if isinstance(key, bytes):
+                try:
+                    key = key.decode()
+                except:
+                    key = key.decode('latin')
+
+            output[key] = row[0]
 
         return output
     else:

From 9e189947d1d51fdc88aa76490a58f8dce4685110 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 21 May 2024 10:37:30 -0700
Subject: [PATCH 14/64] Minor fix after
 bd7cc4234d0d2cc890c5e023f67741615c44484a (thanks @belladoreai)

---
 modules/models_settings.py | 3 +--
 modules/ui.py              | 1 -
 modules/ui_model_menu.py   | 5 +----
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index 2ecd8a58..2d161ad5 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -12,7 +12,6 @@ def get_fallback_settings():
         'wbits': 'None',
         'groupsize': 'None',
         'desc_act': False,
-        'model_type': 'None',
         'max_seq_len': 2048,
         'n_ctx': 2048,
         'rope_freq_base': 0,
@@ -199,7 +198,7 @@ def update_model_parameters(state, initial=False):
             continue
 
         # Setting null defaults
-        if element in ['wbits', 'groupsize', 'model_type'] and value == 'None':
+        if element in ['wbits', 'groupsize'] and value == 'None':
             value = vars(shared.args_defaults)[element]
         elif element in ['cpu_memory'] and value == 0:
             value = vars(shared.args_defaults)[element]
diff --git a/modules/ui.py b/modules/ui.py
index 992616de..0126190a 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -70,7 +70,6 @@ def list_model_elements():
         'use_double_quant',
         'wbits',
         'groupsize',
-        'model_type',
         'pre_layer',
         'triton',
         'desc_act',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index d7b4eabb..76ac7530 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -101,7 +101,6 @@ def create_ui():
                             shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
                             shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                             shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
-                            shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
                             shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                             shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
@@ -186,9 +185,7 @@ def create_ui():
 
 
 def create_event_handlers():
-    shared.gradio['loader'].change(
-        loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then(
-        lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type'))
+    shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params()))
 
     # In this event handler, the interface state is read and updated
     # with the model defaults (if any), and then the model is loaded

From 8aaa0a6f4e504ecd1a973c85b09730467e72cdd2 Mon Sep 17 00:00:00 2001
From: rohitanshu <85547195+iamrohitanshu@users.noreply.github.com>
Date: Tue, 21 May 2024 23:22:22 +0530
Subject: [PATCH 15/64] Fixed minor typo in docs - Training Tab.md (#6038)

---
 docs/05 - Training Tab.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/05 - Training Tab.md b/docs/05 - Training Tab.md
index 590117a3..eba62cb3 100644
--- a/docs/05 - Training Tab.md	
+++ b/docs/05 - Training Tab.md	
@@ -124,7 +124,7 @@ When you're running training, the WebUI's console window will log reports that i
 
 "Loss" in the world of AI training theoretically means "how close is the model to perfect", with `0` meaning "absolutely perfect". This is calculated by measuring the difference between the model outputting exactly the text you're training it to output, and what it actually outputs.
 
-In practice, a good LLM should have a very complex variable range of ideas running in its artificial head, so a loss of `0` would indicate that the model has broken and forgotten to how think about anything other than what you trained it.
+In practice, a good LLM should have a very complex variable range of ideas running in its artificial head, so a loss of `0` would indicate that the model has broken and forgotten how to think about anything other than what you trained it on.
 
 So, in effect, Loss is a balancing game: you want to get it low enough that it understands your data, but high enough that it isn't forgetting everything else. Generally, if it goes below `1.0`, it's going to start forgetting its prior memories, and you should stop training. In some cases you may prefer to take it as low as `0.5` (if you want it to be very very predictable). Different goals have different needs, so don't be afraid to experiment and see what works best for you.
 

From 5499bc9bc8d2b24f163c0026dce05df21a25a691 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 22 May 2024 13:53:59 -0300
Subject: [PATCH 16/64] Fix stopping strings for llama-3 and phi (#6043)

---
 modules/chat.py | 73 +++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 43f5466b..6a388a04 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -45,34 +45,35 @@ yaml.add_representer(str, str_presenter)
 yaml.representer.SafeRepresenter.add_representer(str, str_presenter)
 
 
-def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True):
+def extract_message_prefix_suffix(renderer, strip_trailing_spaces=True):
     '''
-    Given a Jinja template, reverse-engineers the prefix and the suffix for
-    an assistant message (if impersonate=False) or an user message
-    (if impersonate=True)
+    Given a Jinja template, extracts the prefix and suffix for
+    an assistant message and a user message. It assumes that they
+    share the same suffix.
     '''
 
-    if impersonate:
-        messages = [
-            {"role": "user", "content": "<<|user-message-1|>>"},
-            {"role": "user", "content": "<<|user-message-2|>>"},
-        ]
-    else:
-        messages = [
-            {"role": "assistant", "content": "<<|user-message-1|>>"},
-            {"role": "assistant", "content": "<<|user-message-2|>>"},
-        ]
+    messages = [
+        {"role": "user", "content": "<<|user-message-1|>>"},
+        {"role": "assistant", "content": "<<|assistant-message-1|>>"},
+        {"role": "user", "content": "<<|user-message-2|>>"},
+        {"role": "assistant", "content": "<<|assistant-message-2|>>"},
+    ]
 
     prompt = renderer(messages=messages)
+    unwanted_suffix = renderer(messages=[])
 
-    suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0]
-    suffix = prompt.split("<<|user-message-2|>>")[1]
-    prefix = suffix_plus_prefix[len(suffix):]
+    suffix = prompt.split('<<|assistant-message-2|>>')[1]
+    if unwanted_suffix != '':
+        suffix = suffix[:-len(unwanted_suffix)]
+
+    prefix_user = prompt.split('<<|assistant-message-1|>>')[1].split('<<|user-message-2|>>')[0][len(suffix):]
+    prefix_assistant = prompt.split('<<|user-message-1|>>')[1].split('<<|assistant-message-1|>>')[0][len(suffix):]
 
     if strip_trailing_spaces:
-        prefix = prefix.rstrip(' ')
+        prefix_user = prefix_user.rstrip(' ')
+        prefix_assistant = prefix_assistant.rstrip(' ')
 
-    return prefix, suffix
+    return prefix_user, prefix_assistant, suffix
 
 
 def generate_chat_prompt(user_input, state, **kwargs):
@@ -125,7 +126,12 @@ def generate_chat_prompt(user_input, state, **kwargs):
         messages.append({"role": "user", "content": user_input})
 
     def remove_extra_bos(prompt):
-        for bos_token in ['<s>', '<|startoftext|>', '<BOS_TOKEN>', '<|endoftext|>']:
+        if hasattr(shared.tokenizer, 'bos_token_id'):
+           bos_tokens = [shared.tokenizer.decode(shared.tokenizer.bos_token_id)]
+        else:
+            bos_tokens = ['<s>', '<|startoftext|>', '<BOS_TOKEN>']
+
+        for bos_token in bos_tokens:
             while prompt.startswith(bos_token):
                 prompt = prompt[len(bos_token):]
 
@@ -137,6 +143,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
         else:
             prompt = renderer(messages=messages)
 
+        prefix_user, prefix_assistant, suffix = extract_message_prefix_suffix(renderer, strip_trailing_spaces=not _continue)
+        prefix = prefix_user if impersonate else prefix_assistant
+
         if state['mode'] == 'chat-instruct':
             outer_messages = []
             if state['custom_system_message'].strip() != '':
@@ -148,29 +157,25 @@ def generate_chat_prompt(user_input, state, **kwargs):
             command = command.replace('<|prompt|>', prompt)
             command = replace_character_names(command, state['name1'], state['name2'])
 
+
             if _continue:
-                prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0]
                 prefix += messages[-1]["content"]
-            else:
-                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
-                if not impersonate:
-                    prefix = apply_extensions('bot_prefix', prefix, state)
+            elif not impersonate:
+                prefix = apply_extensions('bot_prefix', prefix, state)
 
             outer_messages.append({"role": "user", "content": command})
             outer_messages.append({"role": "assistant", "content": prefix})
 
             prompt = instruction_template.render(messages=outer_messages)
-            suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
             if len(suffix) > 0:
                 prompt = prompt[:-len(suffix)]
 
         else:
+
             if _continue:
-                suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
                 if len(suffix) > 0:
                     prompt = prompt[:-len(suffix)]
             else:
-                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
                 if state['mode'] == 'chat' and not impersonate:
                     prefix = apply_extensions('bot_prefix', prefix, state)
 
@@ -249,15 +254,11 @@ def get_stopping_strings(state):
         renderers.append(renderer)
 
     for renderer in renderers:
-        prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False)
-        prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True)
+        prefix_user, prefix_assistant, suffix = extract_message_prefix_suffix(renderer)
 
-        stopping_strings += [
-            suffix_user + prefix_bot,
-            suffix_user + prefix_user,
-            suffix_bot + prefix_bot,
-            suffix_bot + prefix_user,
-        ]
+        for item in [suffix + prefix_assistant, suffix + prefix_user, suffix]:
+            stopping_strings.append(item)
+            stopping_strings.append(item.rstrip())
 
     if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
         stopping_strings += state.pop('stopping_strings')

From ad54d524f7586d2fa8ab33bad46d0a4b108aa822 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 22 May 2024 17:18:08 -0700
Subject: [PATCH 17/64] Revert "Fix stopping strings for llama-3 and phi
 (#6043)"

This reverts commit 5499bc9bc8d2b24f163c0026dce05df21a25a691.
---
 modules/chat.py | 73 ++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 6a388a04..43f5466b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -45,35 +45,34 @@ yaml.add_representer(str, str_presenter)
 yaml.representer.SafeRepresenter.add_representer(str, str_presenter)
 
 
-def extract_message_prefix_suffix(renderer, strip_trailing_spaces=True):
+def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True):
     '''
-    Given a Jinja template, extracts the prefix and suffix for
-    an assistant message and a user message. It assumes that they
-    share the same suffix.
+    Given a Jinja template, reverse-engineers the prefix and the suffix for
+    an assistant message (if impersonate=False) or an user message
+    (if impersonate=True)
     '''
 
-    messages = [
-        {"role": "user", "content": "<<|user-message-1|>>"},
-        {"role": "assistant", "content": "<<|assistant-message-1|>>"},
-        {"role": "user", "content": "<<|user-message-2|>>"},
-        {"role": "assistant", "content": "<<|assistant-message-2|>>"},
-    ]
+    if impersonate:
+        messages = [
+            {"role": "user", "content": "<<|user-message-1|>>"},
+            {"role": "user", "content": "<<|user-message-2|>>"},
+        ]
+    else:
+        messages = [
+            {"role": "assistant", "content": "<<|user-message-1|>>"},
+            {"role": "assistant", "content": "<<|user-message-2|>>"},
+        ]
 
     prompt = renderer(messages=messages)
-    unwanted_suffix = renderer(messages=[])
 
-    suffix = prompt.split('<<|assistant-message-2|>>')[1]
-    if unwanted_suffix != '':
-        suffix = suffix[:-len(unwanted_suffix)]
-
-    prefix_user = prompt.split('<<|assistant-message-1|>>')[1].split('<<|user-message-2|>>')[0][len(suffix):]
-    prefix_assistant = prompt.split('<<|user-message-1|>>')[1].split('<<|assistant-message-1|>>')[0][len(suffix):]
+    suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0]
+    suffix = prompt.split("<<|user-message-2|>>")[1]
+    prefix = suffix_plus_prefix[len(suffix):]
 
     if strip_trailing_spaces:
-        prefix_user = prefix_user.rstrip(' ')
-        prefix_assistant = prefix_assistant.rstrip(' ')
+        prefix = prefix.rstrip(' ')
 
-    return prefix_user, prefix_assistant, suffix
+    return prefix, suffix
 
 
 def generate_chat_prompt(user_input, state, **kwargs):
@@ -126,12 +125,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
         messages.append({"role": "user", "content": user_input})
 
     def remove_extra_bos(prompt):
-        if hasattr(shared.tokenizer, 'bos_token_id'):
-           bos_tokens = [shared.tokenizer.decode(shared.tokenizer.bos_token_id)]
-        else:
-            bos_tokens = ['<s>', '<|startoftext|>', '<BOS_TOKEN>']
-
-        for bos_token in bos_tokens:
+        for bos_token in ['<s>', '<|startoftext|>', '<BOS_TOKEN>', '<|endoftext|>']:
             while prompt.startswith(bos_token):
                 prompt = prompt[len(bos_token):]
 
@@ -143,9 +137,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
         else:
             prompt = renderer(messages=messages)
 
-        prefix_user, prefix_assistant, suffix = extract_message_prefix_suffix(renderer, strip_trailing_spaces=not _continue)
-        prefix = prefix_user if impersonate else prefix_assistant
-
         if state['mode'] == 'chat-instruct':
             outer_messages = []
             if state['custom_system_message'].strip() != '':
@@ -157,25 +148,29 @@ def generate_chat_prompt(user_input, state, **kwargs):
             command = command.replace('<|prompt|>', prompt)
             command = replace_character_names(command, state['name1'], state['name2'])
 
-
             if _continue:
+                prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0]
                 prefix += messages[-1]["content"]
-            elif not impersonate:
-                prefix = apply_extensions('bot_prefix', prefix, state)
+            else:
+                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
+                if not impersonate:
+                    prefix = apply_extensions('bot_prefix', prefix, state)
 
             outer_messages.append({"role": "user", "content": command})
             outer_messages.append({"role": "assistant", "content": prefix})
 
             prompt = instruction_template.render(messages=outer_messages)
+            suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
             if len(suffix) > 0:
                 prompt = prompt[:-len(suffix)]
 
         else:
-
             if _continue:
+                suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
                 if len(suffix) > 0:
                     prompt = prompt[:-len(suffix)]
             else:
+                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
                 if state['mode'] == 'chat' and not impersonate:
                     prefix = apply_extensions('bot_prefix', prefix, state)
 
@@ -254,11 +249,15 @@ def get_stopping_strings(state):
         renderers.append(renderer)
 
     for renderer in renderers:
-        prefix_user, prefix_assistant, suffix = extract_message_prefix_suffix(renderer)
+        prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False)
+        prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True)
 
-        for item in [suffix + prefix_assistant, suffix + prefix_user, suffix]:
-            stopping_strings.append(item)
-            stopping_strings.append(item.rstrip())
+        stopping_strings += [
+            suffix_user + prefix_bot,
+            suffix_user + prefix_user,
+            suffix_bot + prefix_bot,
+            suffix_bot + prefix_user,
+        ]
 
     if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
         stopping_strings += state.pop('stopping_strings')

From 4f1e96b9e3f9dc1593bed2096a40f68f8dc4d015 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 23 May 2024 20:42:46 -0700
Subject: [PATCH 18/64] Downloader: Add --model-dir argument, respect
 --model-dir in the UI

---
 download-model.py        | 12 ++++++++----
 modules/ui_model_menu.py |  8 +++++++-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/download-model.py b/download-model.py
index 465c76f9..08b23d08 100644
--- a/download-model.py
+++ b/download-model.py
@@ -167,8 +167,11 @@ class ModelDownloader:
         is_llamacpp = has_gguf and specific_file is not None
         return links, sha256, is_lora, is_llamacpp
 
-    def get_output_folder(self, model, branch, is_lora, is_llamacpp=False):
-        base_folder = 'models' if not is_lora else 'loras'
+    def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, model_dir=None):
+        if model_dir:
+            base_folder = model_dir
+        else:
+            base_folder = 'models' if not is_lora else 'loras'
 
         # If the model is of type GGUF, save directly in the base_folder
         if is_llamacpp:
@@ -304,7 +307,8 @@ if __name__ == '__main__':
     parser.add_argument('--threads', type=int, default=4, help='Number of files to download simultaneously.')
     parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
     parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).')
-    parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+    parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.')
+    parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).')
     parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
     parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
     parser.add_argument('--max-retries', type=int, default=5, help='Max retries count when get error in download time.')
@@ -333,7 +337,7 @@ if __name__ == '__main__':
     if args.output:
         output_folder = Path(args.output)
     else:
-        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)
+        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, model_dir=args.model_dir)
 
     if args.check:
         # Check previously downloaded files
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 76ac7530..6cda7ecd 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -290,7 +290,13 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
             return
 
         yield ("Getting the output folder")
-        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)
+        output_folder = downloader.get_output_folder(
+            model,
+            branch,
+            is_lora,
+            is_llamacpp=is_llamacpp,
+            model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None
+        )
 
         if output_folder == Path("models"):
             output_folder = Path(shared.args.model_dir)

From 8df68b05e906e5156ca0254d011267af786bbaa0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 27 May 2024 05:03:30 -0700
Subject: [PATCH 19/64] Remove MinPLogitsWarper (it's now a transformers
 built-in)

---
 modules/sampler_hijack.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)

diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index 1f9ca52c..55d1997b 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -125,36 +125,6 @@ class QuadraticSamplingLogitsWarper(LogitsWarper):
         return transformed_logits
 
 
-class MinPLogitsWarper(LogitsWarper):
-    def __init__(self, min_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
-        if min_p < 0 or min_p > 1.0:
-            raise ValueError(f"`min_p` has to be a float >= 0 and <= 1, but is {min_p}")
-        self.min_p = min_p
-        self.filter_value = filter_value
-        self.min_tokens_to_keep = min_tokens_to_keep
-
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        # Convert logits to probabilities
-        probs = torch.softmax(scores, dim=-1)
-        # Get the probability of the top token for each sequence in the batch
-        top_probs, _ = probs.max(dim=-1, keepdim=True)
-        # Calculate the actual min_p threshold by scaling min_p with the top token's probability
-        scaled_min_p = self.min_p * top_probs
-        # Create a mask for tokens that have a probability less than the scaled min_p
-        tokens_to_remove = probs < scaled_min_p
-
-        sorted_indices = torch.argsort(scores, descending=True, dim=-1)
-        sorted_indices_to_remove = torch.gather(tokens_to_remove, dim=-1, index=sorted_indices)
-
-        if self.min_tokens_to_keep > 1:
-            # Keep at least min_tokens_to_keep
-            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = False
-
-        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
-        scores = scores.masked_fill(indices_to_remove, self.filter_value)
-        return scores
-
-
 class TailFreeLogitsWarper(LogitsWarper):
     def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
         tfs = float(tfs)
@@ -421,14 +391,6 @@ def get_logits_warper_patch(self, generation_config):
             )
         )
 
-    if generation_config.min_p is not None and 0.0 < generation_config.min_p <= 1.0:
-        warpers_to_add.append(
-            MinPLogitsWarper(
-                min_p=generation_config.min_p,
-                min_tokens_to_keep=min_tokens_to_keep
-            )
-        )
-
     if generation_config.dynamic_temperature:
         warpers_to_add.append(
             DynamicTemperatureLogitsWarper(

From a363cdfca117e92967d9a25509146f02fbf63da1 Mon Sep 17 00:00:00 2001
From: Belladore <135602125+belladoreai@users.noreply.github.com>
Date: Mon, 27 May 2024 15:21:30 +0300
Subject: [PATCH 20/64] Fix missing bos token for some models (including
 Llama-3) (#6050)

---
 modules/text_generation.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/modules/text_generation.py b/modules/text_generation.py
index 75294013..4f324239 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -138,9 +138,21 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
             input_ids = np.array(input_ids).reshape(1, len(input_ids))
     else:
         input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
-        if not add_bos_token:
-            while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
-                input_ids = input_ids[:, 1:]
+
+        if hasattr(shared.tokenizer, 'bos_token_id'):
+            if add_bos_token:
+                if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
+                    # Add a missing bos token (it may not have been added due to faulty model metadata)
+                    bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
+                    input_ids = torch.cat((bos_tensor, input_ids), 1)
+
+                # Prevent double bos token due to jinja templates with <s> somewhere
+                while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
+                    input_ids = input_ids[:, 1:]
+            else:
+                # Remove any bos token that may have been added
+                while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
+                    input_ids = input_ids[:, 1:]
 
     # Handling truncation
     if truncation_length is not None:

From 46174a2d33b25667bf773af548c549c3dd9f95a3 Mon Sep 17 00:00:00 2001
From: Belladore <135602125+belladoreai@users.noreply.github.com>
Date: Thu, 13 Jun 2024 04:52:27 +0300
Subject: [PATCH 21/64] Fix error when bos_token_id is None. (#6061)

---
 modules/text_generation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/text_generation.py b/modules/text_generation.py
index 4f324239..ca42ba1f 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -139,7 +139,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     else:
         input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
 
-        if hasattr(shared.tokenizer, 'bos_token_id'):
+        if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
             if add_bos_token:
                 if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
                     # Add a missing bos token (it may not have been added due to faulty model metadata)

From 2d196ed2feb222c8691c218c2ebc7b1ee773f804 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 12 Jun 2024 18:56:44 -0700
Subject: [PATCH 22/64] Remove obsolete pre_layer parameter

---
 modules/models_settings.py | 5 +----
 modules/ui.py              | 1 -
 modules/ui_model_menu.py   | 1 -
 3 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index 2d161ad5..c3712db2 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -204,14 +204,11 @@ def update_model_parameters(state, initial=False):
             value = vars(shared.args_defaults)[element]
 
         # Making some simple conversions
-        if element in ['wbits', 'groupsize', 'pre_layer']:
+        if element in ['wbits', 'groupsize']:
             value = int(value)
         elif element == 'cpu_memory' and value is not None:
             value = f"{value}MiB"
 
-        if element in ['pre_layer']:
-            value = [value] if value > 0 else None
-
         setattr(shared.args, element, value)
 
     found_positive = False
diff --git a/modules/ui.py b/modules/ui.py
index 0126190a..a202d325 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -70,7 +70,6 @@ def list_model_elements():
         'use_double_quant',
         'wbits',
         'groupsize',
-        'pre_layer',
         'triton',
         'desc_act',
         'no_inject_fused_attention',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 6cda7ecd..d240bb6a 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -101,7 +101,6 @@ def create_ui():
                             shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
                             shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                             shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
-                            shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                             shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
                             with gr.Blocks():

From a36fa7307100744a5ee1b2a8b66be186d47db249 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 12 Jun 2024 19:00:21 -0700
Subject: [PATCH 23/64] Lint

---
 modules/models.py         | 4 +---
 modules/sampler_hijack.py | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index b1d0b1cb..cb32a3da 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,5 +1,4 @@
 import gc
-import logging
 import os
 import pprint
 import re
@@ -29,7 +28,6 @@ import modules.shared as shared
 from modules import RoPE, sampler_hijack
 from modules.logging_colors import logger
 from modules.models_settings import get_model_metadata
-from modules.relative_imports import RelativeImport
 
 transformers.logging.set_verbosity_error()
 
@@ -267,7 +265,7 @@ def llamacpp_loader(model_name):
     if path.is_file():
         model_file = path
     else:
-        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] 
+        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
 
     logger.info(f"llama.cpp weights detected: \"{model_file}\"")
     model, tokenizer = LlamaCppModel.from_pretrained(model_file)
diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index 55d1997b..bf8ecf3a 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -218,7 +218,7 @@ class DRYLogitsProcessor(LogitsProcessor):
             match_lengths = {}
 
             for i in match_indices:
-                next_token = input_ids_row[i+1].item()
+                next_token = input_ids_row[i + 1].item()
 
                 if next_token in self.sequence_breakers:
                     continue
@@ -234,7 +234,7 @@ class DRYLogitsProcessor(LogitsProcessor):
                         # Start of input reached.
                         break
 
-                    previous_token = input_ids_row[-(match_length+1)].item()
+                    previous_token = input_ids_row[-(match_length + 1)].item()
                     if input_ids_row[j] != previous_token:
                         # Start of match reached.
                         break

From b675151f25b2814a56aa567fa7f68b28f6952872 Mon Sep 17 00:00:00 2001
From: theo77186 <theo77186@users.noreply.github.com>
Date: Thu, 13 Jun 2024 04:25:05 +0200
Subject: [PATCH 24/64] Use reentrant generation lock (#6107)

---
 server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server.py b/server.py
index 04a5b16d..a8237ba6 100644
--- a/server.py
+++ b/server.py
@@ -32,7 +32,7 @@ import sys
 import time
 from functools import partial
 from pathlib import Path
-from threading import Lock, Thread
+from threading import RLock, Thread
 
 import yaml
 
@@ -243,7 +243,7 @@ if __name__ == "__main__":
         if shared.args.lora:
             add_lora_to_model(shared.args.lora)
 
-    shared.generation_lock = Lock()
+    shared.generation_lock = RLock()
 
     if shared.args.idle_timeout > 0:
         timer_thread = Thread(target=unload_model_if_idle)

From 3abafee69605144a0edc9f53dd318853d3e80e68 Mon Sep 17 00:00:00 2001
From: Belladore <135602125+belladoreai@users.noreply.github.com>
Date: Thu, 13 Jun 2024 05:39:11 +0300
Subject: [PATCH 25/64] DRY sampler improvements (#6053)

---
 modules/sampler_hijack.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index bf8ecf3a..ad74d658 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -204,21 +204,25 @@ class DRYLogitsProcessor(LogitsProcessor):
             input_ids = input_ids[:, -self._range:]
 
         for input_ids_row, scores_row in zip(input_ids, scores):
-            # Raw integer must be extracted here to check for set membership.
-            last_token = input_ids_row[-1].item()
+            # Use normal Python data types for improved performance
+            input_ids = input_ids_row.tolist()
 
+            last_token = input_ids[-1]
             if last_token in self.sequence_breakers:
                 continue
 
             # Exclude the last token as it always matches.
-            match_indices = (input_ids_row[:-1] == last_token).nonzero()
+            match_indices = []
+            for idx, val in enumerate(input_ids[:-1]):
+                if val == last_token:
+                    match_indices.append(idx)
 
             # Stores the maximum matching sequence length
             # for each token immediately following the sequence in the input.
             match_lengths = {}
 
             for i in match_indices:
-                next_token = input_ids_row[i + 1].item()
+                next_token = input_ids[i + 1]
 
                 if next_token in self.sequence_breakers:
                     continue
@@ -227,15 +231,15 @@ class DRYLogitsProcessor(LogitsProcessor):
                 # so the match is at least of length 1.
                 match_length = 1
 
-                # Extend the match backwards as far as possible.
-                while True:
+                # Extend the match backwards (at most to 50 to prevent exponent overflow at penalty calculation) (this cap also improves performance on worst case)
+                while match_length < 50:
                     j = i - match_length
                     if j < 0:
                         # Start of input reached.
                         break
 
-                    previous_token = input_ids_row[-(match_length + 1)].item()
-                    if input_ids_row[j] != previous_token:
+                    previous_token = input_ids[-(match_length + 1)]
+                    if input_ids[j] != previous_token:
                         # Start of match reached.
                         break
 

From 1d79aa67cf8bbaa162dbe4d2950c52c748fe0446 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Thu, 13 Jun 2024 03:34:54 +0000
Subject: [PATCH 26/64] Fix flash-attn UI parameter to actually store true.
 (#6076)

---
 modules/loaders.py       | 4 ++--
 modules/ui.py            | 2 +-
 modules/ui_model_menu.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index cd9d0f88..5099ffb0 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -46,7 +46,7 @@ loaders_and_params = OrderedDict({
         'no_offload_kqv',
         'row_split',
         'tensorcores',
-        'flash-attn',
+        'flash_attn',
         'streaming_llm',
         'attention_sink_size',
     ],
@@ -72,7 +72,7 @@ loaders_and_params = OrderedDict({
         'no_offload_kqv',
         'row_split',
         'tensorcores',
-        'flash-attn',
+        'flash_attn',
         'streaming_llm',
         'attention_sink_size',
         'llamacpp_HF_info',
diff --git a/modules/ui.py b/modules/ui.py
index a202d325..f88c0a82 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -102,7 +102,7 @@ def list_model_elements():
         'no_offload_kqv',
         'row_split',
         'tensorcores',
-        'flash-attn',
+        'flash_attn',
         'streaming_llm',
         'attention_sink_size',
         'hqq_backend',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index d240bb6a..d8b53b11 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -115,7 +115,7 @@ def create_ui():
                             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
-                            shared.gradio['flash-attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
+                            shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
                             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
                             shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')

From 386500aa37eac7d5287d1738984bcef712f002a3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 12 Jun 2024 20:52:42 -0700
Subject: [PATCH 27/64] Avoid unnecessary calls UI -> backend, to make it
 faster

---
 modules/ui_chat.py     | 30 +++++++++++++++---------------
 modules/ui_default.py  |  6 +++---
 modules/ui_notebook.py |  6 +++---
 modules/ui_session.py  |  4 ++--
 server.py              |  6 +++---
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 3193bd67..203e55f9 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -181,7 +181,7 @@ def create_event_handlers():
         chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['textbox'].submit(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
@@ -189,28 +189,28 @@ def create_event_handlers():
         chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Regenerate'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Continue'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Impersonate'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then(
         chat.impersonate_wrapper, gradio(inputs), gradio('textbox', 'display'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Replace last reply'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
@@ -288,7 +288,7 @@ def create_event_handlers():
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
         lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}')
 
     shared.gradio['character_menu'].change(
         chat.load_character, gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context')).success(
@@ -296,7 +296,7 @@ def create_event_handlers():
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
         lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
-        lambda: None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
+        None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
 
     shared.gradio['mode'].change(
         lambda x: [gr.update(visible=x != 'instruct'), gr.update(visible=x == 'chat-instruct')], gradio('mode'), gradio('chat_style', 'chat-instruct_command'), show_progress=False).then(
@@ -336,11 +336,11 @@ def create_event_handlers():
 
     shared.gradio['Submit character'].click(
         chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
 
     shared.gradio['Submit tavern character'].click(
         chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
 
     shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character'))
     shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character'))
@@ -354,28 +354,28 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
         partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-default')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
 
     shared.gradio['send_instruction_to_notebook'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
         partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-notebook')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
 
     shared.gradio['send_instruction_to_negative_prompt'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
         partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('negative_prompt')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}')
 
     shared.gradio['send-chat-to-default'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-default')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
 
     shared.gradio['send-chat-to-notebook'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-notebook')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
 
-    shared.gradio['show_controls'].change(lambda x: None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
+    shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
diff --git a/modules/ui_default.py b/modules/ui_default.py
index 1f962551..bf9800f6 100644
--- a/modules/ui_default.py
+++ b/modules/ui_default.py
@@ -67,21 +67,21 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['textbox-default'].submit(
         lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False)
     shared.gradio['Continue-default'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False)
     shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False)
diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py
index a7c62baf..307bc0f3 100644
--- a/modules/ui_notebook.py
+++ b/modules/ui_notebook.py
@@ -67,14 +67,14 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['textbox-notebook'].submit(
         lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False)
     shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False)
@@ -83,7 +83,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False)
     shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False)
diff --git a/modules/ui_session.py b/modules/ui_session.py
index 08929c33..087091ce 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -32,10 +32,10 @@ def create_ui():
         # Reset interface event
         shared.gradio['reset_interface'].click(
             set_interface_arguments, gradio('extensions_menu', 'bool_menu'), None).then(
-            lambda: None, None, None, js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;padding-top:20%;margin:0;height:100vh;color:lightgray;text-align:center;background:var(--body-background-fill)">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500); return []}')
+            None, None, None, js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;padding-top:20%;margin:0;height:100vh;color:lightgray;text-align:center;background:var(--body-background-fill)">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500); return []}')
 
         shared.gradio['toggle_dark_mode'].click(
-            lambda: None, None, None, js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}').then(
+            None, None, None, js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}').then(
             lambda x: 'dark' if x == 'light' else 'light', gradio('theme_state'), gradio('theme_state'))
 
         shared.gradio['save_settings'].click(
diff --git a/server.py b/server.py
index a8237ba6..269e82b3 100644
--- a/server.py
+++ b/server.py
@@ -146,9 +146,9 @@ def create_interface():
         ui_model_menu.create_event_handlers()
 
         # Interface launch events
-        shared.gradio['interface'].load(lambda: None, None, None, js=f"() => {{if ({str(shared.settings['dark_theme']).lower()}) {{ document.getElementsByTagName('body')[0].classList.add('dark'); }} }}")
-        shared.gradio['interface'].load(lambda: None, None, None, js=f"() => {{{js}}}")
-        shared.gradio['interface'].load(lambda x: None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
+        shared.gradio['interface'].load(None, None, None, js=f"() => {{if ({str(shared.settings['dark_theme']).lower()}) {{ document.getElementsByTagName('body')[0].classList.add('dark'); }} }}")
+        shared.gradio['interface'].load(None, None, None, js=f"() => {{{js}}}")
+        shared.gradio['interface'].load(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
         shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False)
         shared.gradio['interface'].load(chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))
 

From 8930bfc5f487b78fe103a1b66bd81fc9d7a84a1d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 13 Jun 2024 20:38:31 -0300
Subject: [PATCH 28/64] Bump PyTorch, ExLlamaV2, flash-attention (#6122)

---
 README.md                      | 14 +++++++-------
 one_click.py                   |  6 +++---
 requirements.txt               | 18 +++++++++---------
 requirements_amd.txt           |  6 +++---
 requirements_amd_noavx2.txt    |  6 +++---
 requirements_apple_intel.txt   |  2 +-
 requirements_apple_silicon.txt |  2 +-
 requirements_noavx2.txt        | 18 +++++++++---------
 8 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index a699a9ab..9655ea79 100644
--- a/README.md
+++ b/README.md
@@ -76,12 +76,12 @@ conda activate textgen
 
 | System | GPU | Command |
 |--------|---------|---------|
-| Linux/WSL | NVIDIA | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121` |
-| Linux/WSL | CPU only | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/rocm5.6` |
-| MacOS + MPS | Any | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1` |
-| Windows | NVIDIA | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121` |
-| Windows | CPU only | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1` |
+| Linux/WSL | NVIDIA | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121` |
+| Linux/WSL | CPU only | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu` |
+| Linux | AMD | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/rocm5.6` |
+| MacOS + MPS | Any | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1` |
+| Windows | NVIDIA | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121` |
+| Windows | CPU only | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1` |
 
 The up-to-date commands can be found here: https://pytorch.org/get-started/locally/.
 
@@ -146,7 +146,7 @@ Then browse to
 1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12:
 
 ```
-pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
 conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime
 ```
 
diff --git a/one_click.py b/one_click.py
index 55a4a3db..187b3c89 100644
--- a/one_click.py
+++ b/one_click.py
@@ -16,9 +16,9 @@ import sys
 
 
 # Define the required PyTorch version
-TORCH_VERSION = "2.2.1"
-TORCHVISION_VERSION = "0.17.1"
-TORCHAUDIO_VERSION = "2.2.1"
+TORCH_VERSION = "2.3.1"
+TORCHVISION_VERSION = "0.18.1"
+TORCHAUDIO_VERSION = "2.3.1"
 
 # Environment
 script_dir = os.getcwd()
diff --git a/requirements.txt b/requirements.txt
index c1c63504..012df8d2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,13 +53,13 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 8489a97c..01c50dd7 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -40,8 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 68a82f40..937fdbae 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index d3cecd08..75973459 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -38,4 +38,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index c940cc52..542ab52f 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -40,4 +40,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index c110be62..376ba0fd 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -53,13 +53,13 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"

From 9aef01551df699e71d0489dcf7eaecd34e2ec11f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 13 Jun 2024 17:53:07 -0700
Subject: [PATCH 29/64] Revert "Use reentrant generation lock (#6107)"

This reverts commit b675151f25b2814a56aa567fa7f68b28f6952872.
---
 server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server.py b/server.py
index 269e82b3..7afa954e 100644
--- a/server.py
+++ b/server.py
@@ -32,7 +32,7 @@ import sys
 import time
 from functools import partial
 from pathlib import Path
-from threading import RLock, Thread
+from threading import Lock, Thread
 
 import yaml
 
@@ -243,7 +243,7 @@ if __name__ == "__main__":
         if shared.args.lora:
             add_lora_to_model(shared.args.lora)
 
-    shared.generation_lock = RLock()
+    shared.generation_lock = Lock()
 
     if shared.args.idle_timeout > 0:
         timer_thread = Thread(target=unload_model_if_idle)

From 0f3a423de11ce4196d840ece18d39ecba6f603fa Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 13 Jun 2024 19:33:15 -0700
Subject: [PATCH 30/64] Alternative solution to "get next logits" deadlock
 (#6106)

---
 modules/logits.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/modules/logits.py b/modules/logits.py
index 1fe2e73e..447deb24 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -16,19 +16,24 @@ def get_next_logits(*args, **kwargs):
     if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
         shared.model, shared.tokenizer = load_model(shared.previous_model_name)
 
-    shared.generation_lock.acquire()
+    needs_lock = kwargs.get('use_samplers', False)
+    if needs_lock:
+        shared.generation_lock.acquire()
+
     try:
         result = _get_next_logits(*args, **kwargs)
     except Exception:
         traceback.print_exc()
         result = None
 
-    models.last_generation_time = time.time()
-    shared.generation_lock.release()
+    if needs_lock:
+        models.last_generation_time = time.time()
+        shared.generation_lock.release()
+
     return result
 
 
-def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
+def _get_next_logits(prompt, state, use_samplers=False, previous="", top_logits=25, return_dict=False):
     if shared.model is None:
         logger.error("No model is loaded! Select one in the Model tab.")
         return 'Error: No model is loaded1 Select one in the Model tab.', previous

From 10601850d92c500f8e63ade4f622d20da4f62c78 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 13 Jun 2024 19:54:12 -0700
Subject: [PATCH 31/64] Fix after previous commit

---
 modules/logits.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/logits.py b/modules/logits.py
index 447deb24..4233c8a5 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -16,7 +16,7 @@ def get_next_logits(*args, **kwargs):
     if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
         shared.model, shared.tokenizer = load_model(shared.previous_model_name)
 
-    needs_lock = kwargs.get('use_samplers', False)
+    needs_lock = not args[2]  # use_samplers
     if needs_lock:
         shared.generation_lock.acquire()
 
@@ -33,7 +33,7 @@ def get_next_logits(*args, **kwargs):
     return result
 
 
-def _get_next_logits(prompt, state, use_samplers=False, previous="", top_logits=25, return_dict=False):
+def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
     if shared.model is None:
         logger.error("No model is loaded! Select one in the Model tab.")
         return 'Error: No model is loaded1 Select one in the Model tab.', previous

From fdd8fab9cfe1ffcc7db97bd3e7d9bac31f05a6c2 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 14 Jun 2024 13:46:35 -0300
Subject: [PATCH 32/64] Bump hqq from 0.1.7.post2 to 0.1.7.post3 (#6090)

---
 requirements.txt                 | 2 +-
 requirements_amd.txt             | 2 +-
 requirements_amd_noavx2.txt      | 2 +-
 requirements_apple_intel.txt     | 2 +-
 requirements_apple_silicon.txt   | 2 +-
 requirements_cpu_only.txt        | 2 +-
 requirements_cpu_only_noavx2.txt | 2 +-
 requirements_noavx2.txt          | 2 +-
 requirements_nowheels.txt        | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 012df8d2..90d2dffb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 01c50dd7..2b350a4e 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 937fdbae..770dd74a 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 75973459..631158e8 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 542ab52f..2857eb03 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index 77420e6f..b0634fe5 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 56bdf547..a40a68f0 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 376ba0fd..8dd50c1e 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -6,7 +6,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index 821049bc..e96d468f 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -3,7 +3,7 @@ colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
+hqq==0.1.7.post3
 jinja2==3.1.2
 lm_eval==0.3.0
 markdown

From 1576227f169f6b5dc7676348b75a3cc472fe2ba2 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Fri, 14 Jun 2024 11:51:01 -0500
Subject: [PATCH 33/64] Fix GGUFs with no BOS token present, mainly qwen2
 models. (#6119)

---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
---
 modules/models_settings.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index c3712db2..e9645c96 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -64,7 +64,11 @@ def get_model_metadata(model):
         if 'tokenizer.chat_template' in metadata:
             template = metadata['tokenizer.chat_template']
             eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
-            bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
+            if 'tokenizer.ggml.bos_token_id' in metadata:
+                bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
+            else:
+                bos_token = ""
+
             template = template.replace('eos_token', "'{}'".format(eos_token))
             template = template.replace('bos_token', "'{}'".format(bos_token))
 

From 9420973b6224fe76fbef83659184a810436a8396 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 14 Jun 2024 16:42:03 -0300
Subject: [PATCH 34/64] Downgrade PyTorch to 2.2.2 (#6124)

---
 README.md                      | 14 +++++++-------
 one_click.py                   |  6 +++---
 requirements.txt               | 18 +++++++++---------
 requirements_amd.txt           |  6 +++---
 requirements_amd_noavx2.txt    |  6 +++---
 requirements_apple_intel.txt   |  2 +-
 requirements_apple_silicon.txt |  2 +-
 requirements_noavx2.txt        | 18 +++++++++---------
 8 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 9655ea79..52605f5e 100644
--- a/README.md
+++ b/README.md
@@ -76,12 +76,12 @@ conda activate textgen
 
 | System | GPU | Command |
 |--------|---------|---------|
-| Linux/WSL | NVIDIA | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121` |
-| Linux/WSL | CPU only | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/rocm5.6` |
-| MacOS + MPS | Any | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1` |
-| Windows | NVIDIA | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121` |
-| Windows | CPU only | `pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1` |
+| Linux/WSL | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` |
+| Linux/WSL | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu` |
+| Linux | AMD | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.6` |
+| MacOS + MPS | Any | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` |
+| Windows | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` |
+| Windows | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` |
 
 The up-to-date commands can be found here: https://pytorch.org/get-started/locally/.
 
@@ -146,7 +146,7 @@ Then browse to
 1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12:
 
 ```
-pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118
 conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime
 ```
 
diff --git a/one_click.py b/one_click.py
index 187b3c89..bc30feb3 100644
--- a/one_click.py
+++ b/one_click.py
@@ -16,9 +16,9 @@ import sys
 
 
 # Define the required PyTorch version
-TORCH_VERSION = "2.3.1"
-TORCHVISION_VERSION = "0.18.1"
-TORCHAUDIO_VERSION = "2.3.1"
+TORCH_VERSION = "2.2.2"
+TORCHVISION_VERSION = "0.17.2"
+TORCHAUDIO_VERSION = "2.2.2"
 
 # Environment
 script_dir = os.getcwd()
diff --git a/requirements.txt b/requirements.txt
index 90d2dffb..01510dea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,13 +53,13 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 2b350a4e..8e7b24cd 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -40,8 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 770dd74a..65122f09 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 631158e8..7fc9202b 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -38,4 +38,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 2857eb03..2f45cff5 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -40,4 +40,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 8dd50c1e..4ddffe67 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -53,13 +53,13 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"

From b6eaf7923e5d0ce1110f65b8003732f005754fa9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 14 Jun 2024 21:22:09 -0700
Subject: [PATCH 35/64] Bump llama-cpp-python to 0.2.78

---
 requirements.txt                 | 24 ++++++++++++------------
 requirements_amd.txt             | 12 ++++++------
 requirements_amd_noavx2.txt      |  8 ++++----
 requirements_apple_intel.txt     | 12 ++++++------
 requirements_apple_silicon.txt   | 16 ++++++++--------
 requirements_cpu_only.txt        |  8 ++++----
 requirements_cpu_only_noavx2.txt |  8 ++++----
 requirements_noavx2.txt          | 24 ++++++++++++------------
 8 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 01510dea..7f1cc37a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 8e7b24cd..840f209f 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -32,14 +32,14 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.78+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.78+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 65122f09..30b2ad14 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -32,10 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 7fc9202b..13a58b25 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -32,10 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 2f45cff5..999a773f 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -32,12 +32,12 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index b0634fe5..65886c90 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index a40a68f0..2938d54a 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 4ddffe67..caf42187 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

From fd7c3c5bb0fe20fdde1e72ce882667ac4dc94591 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 15 Jun 2024 06:38:05 -0700
Subject: [PATCH 36/64] Don't git pull on installation (to make past releases
 installable)

---
 one_click.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/one_click.py b/one_click.py
index bc30feb3..e94b6d44 100644
--- a/one_click.py
+++ b/one_click.py
@@ -315,7 +315,7 @@ def install_webui():
         run_cmd("conda install -y libuv")
 
     # Install the webui requirements
-    update_requirements(initial_installation=True)
+    update_requirements(initial_installation=True, pull=False)
 
 
 def get_extensions_names():

From 229d89ccfb862ad3899c87acf5b103a0b88d25c2 Mon Sep 17 00:00:00 2001
From: Guanghua Lu <102669562+Touch-Night@users.noreply.github.com>
Date: Sun, 16 Jun 2024 10:00:13 +0800
Subject: [PATCH 37/64] Make logs more readable, no more \u7f16\u7801 (#6127)

---
 modules/chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index 43f5466b..5da6ad3b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -492,7 +492,7 @@ def save_history(history, unique_id, character, mode):
         p.parent.mkdir(parents=True)
 
     with open(p, 'w', encoding='utf-8') as f:
-        f.write(json.dumps(history, indent=4))
+        f.write(json.dumps(history, indent=4, ensure_ascii=False))
 
 
 def rename_history(old_id, new_id, character, mode):

From b10d735176c42042acb1ecafe580e83ed71afe83 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 19 Jun 2024 17:40:33 -0700
Subject: [PATCH 38/64] Minor CSS linting

---
 css/html_instruct_style.css | 4 ++--
 css/main.css                | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 8891d180..bdf68aad 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -62,8 +62,8 @@
 
 .gradio-container .chat .user-message {
     padding: 20px;
-    padding-left: 0px;
-    padding-right: 0px;
+    padding-left: 0;
+    padding-right: 0;
     background-color: transparent;
     border-radius: 8px;
     border-bottom-right-radius: 0;
diff --git a/css/main.css b/css/main.css
index a1ff2a36..0c149d2c 100644
--- a/css/main.css
+++ b/css/main.css
@@ -96,7 +96,7 @@ gradio-app > :first-child {
 
 .header_bar {
     background-color: #f7f7f7;
-    box-shadow: 0 0px 3px rgba(22 22 22 / 35%);
+    box-shadow: 0 0 3px rgba(22 22 22 / 35%);
     margin-bottom: 0;
     overflow-x: scroll;
     margin-left: calc(-1 * var(--size-4));
@@ -433,12 +433,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 .message-body code {
     white-space: pre-wrap !important;
     word-wrap: break-word !important;
-    border: 1px solid #666666;
+    border: 1px solid #666;
     border-radius: 5px;
     font-size: 82%;
     padding: 1px 3px;
     background: #0d1117 !important;
-    color: rgb(201, 209, 217);
+    color: rgb(201 209 217);
 }
 
 .message-body pre > code {

From 2c5a9eb597fd4332c81d493fa231b73593076d95 Mon Sep 17 00:00:00 2001
From: GodEmperor785 <32663080+GodEmperor785@users.noreply.github.com>
Date: Thu, 20 Jun 2024 02:42:17 +0200
Subject: [PATCH 39/64] Change limits of RoPE scaling sliders in UI (#6142)

---
 modules/ui_model_menu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index d8b53b11..f8141bd1 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -105,8 +105,8 @@ def create_ui():
                             shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
                             with gr.Blocks():
                                 shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
-                                shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
-                                shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
+                                shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=20000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
+                                shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=0.1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
 
                             shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
 

From 5993904acf391d2a98ed399ba31d93fb0dbec296 Mon Sep 17 00:00:00 2001
From: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com>
Date: Sat, 22 Jun 2024 20:40:25 -0400
Subject: [PATCH 40/64] Fix several typos in the codebase (#6151)

---
 docs/02 - Default and Notebook Tabs.md | 4 ++--
 docs/12 - OpenAI API.md                | 2 +-
 docs/README.md                         | 2 +-
 modules/LoRA.py                        | 2 +-
 modules/github.py                      | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/02 - Default and Notebook Tabs.md b/docs/02 - Default and Notebook Tabs.md
index c450635e..4bb78448 100644
--- a/docs/02 - Default and Notebook Tabs.md	
+++ b/docs/02 - Default and Notebook Tabs.md	
@@ -18,13 +18,13 @@ In the **Prompt** menu, you can select from some predefined prompts defined unde
 
 ### Output
 
-Four tabs can be found:
+Five tabs can be found:
 
 * **Raw**: where the raw text generated by the model appears.
 * **Markdown**: it contains a "Render" button. You can click on it at any time to render the current output as markdown. This is particularly useful for models that generate LaTeX equations like GALACTICA.
 * **HTML**: displays the output in an HTML style that is meant to be easier to read. Its style is defined under `text-generation-webui/css/html_readable_style.css`.
 * **Logits**: when you click on "Get next token probabilities", this tab displays the 50 most likely next tokens and their probabilities based on your current input. If "Use samplers" is checked, the probabilities will be the ones after the sampling parameters in the "Parameters" > "Generation" tab are applied. Otherwise, they will be the raw probabilities generated by the model.
-* **Tokens**: allows you to tokenize your prompt and see the ID numbers for the individuals tokens.
+* **Tokens**: allows you to tokenize your prompt and see the ID numbers for the individual tokens.
 
 ## Notebook tab
 
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index fee57d33..b00a1f34 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -219,7 +219,7 @@ print()
 
 ### Environment variables
 
-The following environment variables can be used (they take precendence over everything else):
+The following environment variables can be used (they take precedence over everything else):
 
 | Variable Name          | Description                                                                                        | Example Value              |
 |------------------------|------------------------------------|----------------------------|
diff --git a/docs/README.md b/docs/README.md
index d2efbf1d..666ee85c 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,4 +1,4 @@
-These files is a mirror of the documentation at:
+These files are a mirror of the documentation at:
 
 # https://github.com/oobabooga/text-generation-webui/wiki
 
diff --git a/modules/LoRA.py b/modules/LoRA.py
index 15132f4e..eda5e406 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -73,7 +73,7 @@ def add_lora_autogptq(lora_names):
         if len(lora_names) > 1:
             logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
         if not shared.args.no_inject_fused_attention:
-            logger.warning('Fused Atttention + AutoGPTQ may break Lora loading. Disable it.')
+            logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')
 
         peft_config = GPTQLoraConfig(
             inference_mode=True,
diff --git a/modules/github.py b/modules/github.py
index 282267b6..f3dc26e1 100644
--- a/modules/github.py
+++ b/modules/github.py
@@ -32,7 +32,7 @@ def clone_or_pull_repository(github_url):
         yield f"Cloning {github_url}..."
         clone_output = subprocess.check_output(["git", "clone", github_url, repo_path], stderr=subprocess.STDOUT)
         new_extensions.add(repo_name)
-        yield f"The extension `{repo_name}` has been downloaded.\n\nPlease close the the web UI completely and launch it again to be able to load it."
+        yield f"The extension `{repo_name}` has been downloaded.\n\nPlease close the web UI completely and launch it again to be able to load it."
         return clone_output.decode()
     except subprocess.CalledProcessError as e:
         return str(e)

From 125bb7b03be424c3070b6c27e8e7fdbd45c21230 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 23 Jun 2024 19:54:28 -0700
Subject: [PATCH 41/64] Revert "Bump llama-cpp-python to 0.2.78"

This reverts commit b6eaf7923e5d0ce1110f65b8003732f005754fa9.
---
 requirements.txt                 | 24 ++++++++++++------------
 requirements_amd.txt             | 12 ++++++------
 requirements_amd_noavx2.txt      |  8 ++++----
 requirements_apple_intel.txt     | 12 ++++++------
 requirements_apple_silicon.txt   | 16 ++++++++--------
 requirements_cpu_only.txt        |  8 ++++----
 requirements_cpu_only_noavx2.txt |  8 ++++----
 requirements_noavx2.txt          | 24 ++++++++++++------------
 8 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 7f1cc37a..01510dea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 840f209f..8e7b24cd 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -32,14 +32,14 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.78+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.78+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 30b2ad14..65122f09 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -32,10 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 13a58b25..7fc9202b 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -32,10 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 999a773f..2f45cff5 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -32,12 +32,12 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.78-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index 65886c90..b0634fe5 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 2938d54a..a40a68f0 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index caf42187..4ddffe67 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.78+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.78+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.78+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

From 57119c1b300d8f7b1cc87c6aea9b1bf5a294d87f Mon Sep 17 00:00:00 2001
From: Louis Del Valle <92354925+nero-dv@users.noreply.github.com>
Date: Sun, 23 Jun 2024 23:56:51 -0500
Subject: [PATCH 42/64] Update block_requests.py to resolve unexpected type
 error (500 error) (#5976)

---
 modules/block_requests.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/block_requests.py b/modules/block_requests.py
index 0eb10fa2..e95cd7e2 100644
--- a/modules/block_requests.py
+++ b/modules/block_requests.py
@@ -43,19 +43,19 @@ def my_open(*args, **kwargs):
         with original_open(*args, **kwargs) as f:
             file_contents = f.read()
 
-        file_contents = file_contents.replace(b'\t\t<script\n\t\t\tsrc="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.9/iframeResizer.contentWindow.min.js"\n\t\t\tasync\n\t\t></script>', b'')
-        file_contents = file_contents.replace(b'cdnjs.cloudflare.com', b'127.0.0.1')
+        file_contents = file_contents.replace('\t\t<script\n\t\t\tsrc="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.9/iframeResizer.contentWindow.min.js"\n\t\t\tasync\n\t\t></script>', '')
+        file_contents = file_contents.replace('cdnjs.cloudflare.com', b'127.0.0.1')
         file_contents = file_contents.replace(
-            b'</head>',
-            b'\n    <script src="file/js/katex/katex.min.js"></script>'
-            b'\n    <script src="file/js/katex/auto-render.min.js"></script>'
-            b'\n    <script src="file/js/highlightjs/highlight.min.js"></script>'
-            b'\n    <script src="file/js/highlightjs/highlightjs-copy.min.js"></script>'
-            b'\n    <script>hljs.addPlugin(new CopyButtonPlugin());</script>'
-            b'\n  </head>'
+            '</head>',
+            '\n    <script src="file/js/katex/katex.min.js"></script>'
+            '\n    <script src="file/js/katex/auto-render.min.js"></script>'
+            '\n    <script src="file/js/highlightjs/highlight.min.js"></script>'
+            '\n    <script src="file/js/highlightjs/highlightjs-copy.min.js"></script>'
+            '\n    <script>hljs.addPlugin(new CopyButtonPlugin());</script>'
+            '\n  </head>'
         )
 
-        return io.BytesIO(file_contents)
+        return io.StringIO(file_contents)
     else:
         return original_open(*args, **kwargs)
 

From 5e8dc56f8a70628113d2d544b033187d07b918dd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 23 Jun 2024 21:58:28 -0700
Subject: [PATCH 43/64] Fix after previous commit

---
 modules/block_requests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/block_requests.py b/modules/block_requests.py
index e95cd7e2..1bfa9a99 100644
--- a/modules/block_requests.py
+++ b/modules/block_requests.py
@@ -44,7 +44,7 @@ def my_open(*args, **kwargs):
             file_contents = f.read()
 
         file_contents = file_contents.replace('\t\t<script\n\t\t\tsrc="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.9/iframeResizer.contentWindow.min.js"\n\t\t\tasync\n\t\t></script>', '')
-        file_contents = file_contents.replace('cdnjs.cloudflare.com', b'127.0.0.1')
+        file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1')
         file_contents = file_contents.replace(
             '</head>',
             '\n    <script src="file/js/katex/katex.min.js"></script>'

From b48ab482f89f50b897a748ca2097329954bed006 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 23 Jun 2024 22:05:19 -0700
Subject: [PATCH 44/64] Remove obsolete "gptq_for_llama_info" message

---
 modules/ui_model_menu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index f8141bd1..53a9a238 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -147,7 +147,6 @@ def create_ui():
 
                             shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.')
                             shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
-                            shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
                             shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
                             shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
 

From 536f8d58d43dd70517d258611b49e9dade4d798e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 23 Jun 2024 22:09:24 -0700
Subject: [PATCH 45/64] Do not expose alpha_value to llama.cpp & rope_freq_base
 to transformers

To avoid confusion
---
 modules/RoPE.py           | 18 ------------------
 modules/llamacpp_hf.py    |  4 ++--
 modules/llamacpp_model.py |  4 ++--
 modules/loaders.py        |  3 ---
 modules/models.py         |  4 ++--
 5 files changed, 6 insertions(+), 27 deletions(-)
 delete mode 100644 modules/RoPE.py

diff --git a/modules/RoPE.py b/modules/RoPE.py
deleted file mode 100644
index 31163a33..00000000
--- a/modules/RoPE.py
+++ /dev/null
@@ -1,18 +0,0 @@
-def get_alpha_value(alpha, base):
-    '''
-    Gets alpha_value from alpha_value and rope_freq_base
-    '''
-    if base > 0:
-        return (base / 10000.) ** (63 / 64.)
-    else:
-        return alpha
-
-
-def get_rope_freq_base(alpha, base):
-    '''
-    Gets rope_freq_base from alpha_value and rope_freq_base
-    '''
-    if base > 0:
-        return base
-    else:
-        return 10000 * alpha ** (64 / 63.)
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index f30be66a..74af5fbf 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from modules import RoPE, llama_cpp_python_hijack, shared
+from modules import llama_cpp_python_hijack, shared
 from modules.logging_colors import logger
 
 try:
@@ -212,7 +212,7 @@ class LlamacppHF(PreTrainedModel):
             'mul_mat_q': not shared.args.no_mul_mat_q,
             'numa': shared.args.numa,
             'n_gpu_layers': shared.args.n_gpu_layers,
-            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
+            'rope_freq_base': shared.args.rope_freq_base,
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'logits_all': shared.args.logits_all,
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index b2a25d36..d62fd517 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -4,7 +4,7 @@ from functools import partial
 import numpy as np
 import torch
 
-from modules import RoPE, llama_cpp_python_hijack, shared
+from modules import llama_cpp_python_hijack, shared
 from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
@@ -92,7 +92,7 @@ class LlamaCppModel:
             'mul_mat_q': not shared.args.no_mul_mat_q,
             'numa': shared.args.numa,
             'n_gpu_layers': shared.args.n_gpu_layers,
-            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
+            'rope_freq_base': shared.args.rope_freq_base,
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'offload_kqv': not shared.args.no_offload_kqv,
diff --git a/modules/loaders.py b/modules/loaders.py
index 5099ffb0..7bf1cde4 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -22,7 +22,6 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
         'use_flash_attention_2',
         'alpha_value',
-        'rope_freq_base',
         'compress_pos_emb',
         'disable_exllama',
         'disable_exllamav2',
@@ -38,7 +37,6 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'no_mul_mat_q',
-        'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
         'cpu',
@@ -60,7 +58,6 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'no_mul_mat_q',
-        'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
         'cpu',
diff --git a/modules/models.py b/modules/models.py
index cb32a3da..bd54c146 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -25,7 +25,7 @@ from transformers import (
 )
 
 import modules.shared as shared
-from modules import RoPE, sampler_hijack
+from modules import sampler_hijack
 from modules.logging_colors import logger
 from modules.models_settings import get_model_metadata
 
@@ -248,7 +248,7 @@ def huggingface_loader(model_name):
         if shared.args.compress_pos_emb > 1:
             params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
         elif shared.args.alpha_value > 1:
-            params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)}
+            params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
 
         logger.info("TRANSFORMERS_PARAMS=")
         pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)

From 577a8cd3ee9dcc5366760d6ab02b92edcd9f2df9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 24 Jun 2024 02:30:03 -0300
Subject: [PATCH 46/64] Add TensorRT-LLM support (#5715)

---
 docker/TensorRT-LLM/Dockerfile |  27 +++++++
 modules/loaders.py             |  15 ++++
 modules/models.py              |  10 ++-
 modules/models_settings.py     |   3 +
 modules/shared.py              |   6 ++
 modules/tensorrt_llm.py        | 131 +++++++++++++++++++++++++++++++++
 modules/text_generation.py     |   6 +-
 modules/ui.py                  |   1 +
 modules/ui_model_menu.py       |   2 +
 9 files changed, 197 insertions(+), 4 deletions(-)
 create mode 100644 docker/TensorRT-LLM/Dockerfile
 create mode 100644 modules/tensorrt_llm.py

diff --git a/docker/TensorRT-LLM/Dockerfile b/docker/TensorRT-LLM/Dockerfile
new file mode 100644
index 00000000..ae503c94
--- /dev/null
+++ b/docker/TensorRT-LLM/Dockerfile
@@ -0,0 +1,27 @@
+FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-runtime
+
+# Install Git
+RUN apt update && apt install -y git
+
+# System-wide TensorRT-LLM requirements
+RUN apt install -y openmpi-bin libopenmpi-dev
+
+# Set the working directory
+WORKDIR /app
+
+# Install text-generation-webui
+RUN git clone https://github.com/oobabooga/text-generation-webui
+WORKDIR /app/text-generation-webui
+RUN pip install -r requirements.txt
+
+# This is needed to avoid an error about "Failed to build mpi4py" in the next command
+ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+
+# Install TensorRT-LLM
+RUN pip3 install tensorrt_llm==0.10.0 -U --pre --extra-index-url https://pypi.nvidia.com
+
+# Expose the necessary port for the Python server
+EXPOSE 7860 5000
+
+# Run the Python server.py script with the specified command
+CMD ["python", "server.py", "--api", "--listen"]
diff --git a/modules/loaders.py b/modules/loaders.py
index 7bf1cde4..1da37595 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -131,6 +131,11 @@ loaders_and_params = OrderedDict({
         'hqq_backend',
         'trust_remote_code',
         'no_use_fast',
+    ],
+    'TensorRT-LLM': [
+        'max_seq_len',
+        'cpp_runner',
+        'tensorrt_llm_info',
     ]
 })
 
@@ -316,6 +321,16 @@ loaders_samplers = {
         'skip_special_tokens',
         'auto_max_new_tokens',
     },
+    'TensorRT-LLM': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'ban_eos_token',
+        'auto_max_new_tokens',
+    }
 }
 
 
diff --git a/modules/models.py b/modules/models.py
index bd54c146..da741cb0 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -77,6 +77,7 @@ def load_model(model_name, loader=None):
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'AutoAWQ': AutoAWQ_loader,
         'HQQ': HQQ_loader,
+        'TensorRT-LLM': TensorRT_LLM_loader,
     }
 
     metadata = get_model_metadata(model_name)
@@ -101,7 +102,7 @@ def load_model(model_name, loader=None):
             tokenizer = load_tokenizer(model_name, model)
 
     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
-    if loader.lower().startswith('exllama'):
+    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'):
         shared.settings['truncation_length'] = shared.args.max_seq_len
     elif loader in ['llama.cpp', 'llamacpp_HF']:
         shared.settings['truncation_length'] = shared.args.n_ctx
@@ -337,6 +338,13 @@ def HQQ_loader(model_name):
     return model
 
 
+def TensorRT_LLM_loader(model_name):
+    from modules.tensorrt_llm import TensorRTLLMModel
+
+    model = TensorRTLLMModel.from_pretrained(model_name)
+    return model
+
+
 def get_max_memory_dict():
     max_memory = {}
     max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
diff --git a/modules/models_settings.py b/modules/models_settings.py
index e9645c96..387c5658 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -81,6 +81,9 @@ def get_model_metadata(model):
         # Transformers metadata
         if hf_metadata is not None:
             metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+            if 'pretrained_config' in metadata:
+                metadata = metadata['pretrained_config']
+
             for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
                 if k in metadata:
                     model_settings['truncation_length'] = metadata[k]
diff --git a/modules/shared.py b/modules/shared.py
index 373089dc..ebbfc268 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -165,6 +165,10 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='Dis
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
 
+# TensorRT-LLM
+group = parser.add_argument_group('TensorRT-LLM')
+group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
+
 # DeepSpeed
 group = parser.add_argument_group('DeepSpeed')
 group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@@ -263,6 +267,8 @@ def fix_loader_name(name):
         return 'AutoAWQ'
     elif name in ['hqq']:
         return 'HQQ'
+    elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
+        return 'TensorRT-LLM'
 
 
 def add_extension(name, last=False):
diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py
new file mode 100644
index 00000000..c2685b75
--- /dev/null
+++ b/modules/tensorrt_llm.py
@@ -0,0 +1,131 @@
+from pathlib import Path
+
+import tensorrt_llm
+import torch
+from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
+
+from modules import shared
+from modules.logging_colors import logger
+from modules.text_generation import (
+    get_max_prompt_length,
+    get_reply_from_output_ids
+)
+
+
+class TensorRTLLMModel:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(self, path_to_model):
+
+        path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
+        runtime_rank = tensorrt_llm.mpi_rank()
+
+        # Define model settings
+        runner_kwargs = dict(
+            engine_dir=str(path_to_model),
+            lora_dir=None,
+            rank=runtime_rank,
+            debug_mode=False,
+            lora_ckpt_source="hf",
+        )
+
+        if shared.args.cpp_runner:
+            logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
+            runner_kwargs.update(
+                max_batch_size=1,
+                max_input_len=shared.args.max_seq_len - 512,
+                max_output_len=512,
+                max_beam_width=1,
+                max_attention_window_size=None,
+                sink_token_length=None,
+            )
+        else:
+            logger.info("TensorRT-LLM: Using \"ModelRunner\"")
+
+        # Load the model
+        runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner
+        runner = runner_cls.from_dir(**runner_kwargs)
+
+        result = self()
+        result.model = runner
+        result.runtime_rank = runtime_rank
+
+        return result
+
+    def generate_with_streaming(self, prompt, state):
+        batch_input_ids = []
+        input_ids = shared.tokenizer.encode(
+            prompt,
+            add_special_tokens=True,
+            truncation=False,
+        )
+        input_ids = torch.tensor(input_ids, dtype=torch.int32)
+        input_ids = input_ids[-get_max_prompt_length(state):]  # Apply truncation_length
+        batch_input_ids.append(input_ids)
+
+        if shared.args.cpp_runner:
+            max_new_tokens = min(512, state['max_new_tokens'])
+        elif state['auto_max_new_tokens']:
+            max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
+        else:
+            max_new_tokens = state['max_new_tokens']
+
+        with torch.no_grad():
+            generator = self.model.generate(
+                batch_input_ids,
+                max_new_tokens=max_new_tokens,
+                max_attention_window_size=None,
+                sink_token_length=None,
+                end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1,
+                pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id,
+                temperature=state['temperature'],
+                top_k=state['top_k'],
+                top_p=state['top_p'],
+                num_beams=1,
+                length_penalty=1.0,
+                repetition_penalty=state['repetition_penalty'],
+                presence_penalty=state['presence_penalty'],
+                frequency_penalty=state['frequency_penalty'],
+                stop_words_list=None,
+                bad_words_list=None,
+                lora_uids=None,
+                prompt_table_path=None,
+                prompt_tasks=None,
+                streaming=not shared.args.cpp_runner,
+                output_sequence_lengths=True,
+                return_dict=True,
+                medusa_choices=None
+            )
+
+        torch.cuda.synchronize()
+
+        cumulative_reply = ''
+        starting_from = batch_input_ids[0].shape[-1]
+
+        if shared.args.cpp_runner:
+            sequence_length = generator['sequence_lengths'][0].item()
+            output_ids = generator['output_ids'][0][0][:sequence_length].tolist()
+
+            cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
+            starting_from = sequence_length
+            yield cumulative_reply
+        else:
+            for curr_outputs in generator:
+                if shared.stop_everything:
+                    break
+
+                sequence_length = curr_outputs['sequence_lengths'][0].item()
+                output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
+
+                cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
+                starting_from = sequence_length
+                yield cumulative_reply
+
+    def generate(self, prompt, state):
+        output = ''
+        for output in self.generate_with_streaming(prompt, state):
+            pass
+
+        return output
diff --git a/modules/text_generation.py b/modules/text_generation.py
index ca42ba1f..d971a30e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -54,7 +54,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
             yield ''
             return
 
-        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
+        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
             generate_func = generate_reply_custom
         else:
             generate_func = generate_reply_HF
@@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if shared.tokenizer is None:
         raise ValueError('No tokenizer is loaded')
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
         input_ids = shared.tokenizer.encode(str(prompt))
         if shared.model.__class__.__name__ not in ['Exllamav2Model']:
             input_ids = np.array(input_ids).reshape(1, len(input_ids))
@@ -158,7 +158,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
         return input_ids
     elif shared.args.deepspeed:
         import deepspeed
diff --git a/modules/ui.py b/modules/ui.py
index f88c0a82..c20a7888 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -106,6 +106,7 @@ def list_model_elements():
         'streaming_llm',
         'attention_sink_size',
         'hqq_backend',
+        'cpp_runner',
     ]
     if is_torch_xpu_available():
         for i in range(torch.xpu.device_count()):
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 53a9a238..3ebcd126 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -139,6 +139,7 @@ def create_ui():
                             shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
                             shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
                             shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
+                            shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
                             shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
                             with gr.Blocks():
                                 shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
@@ -149,6 +150,7 @@ def create_ui():
                             shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
                             shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
                             shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
+                            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
 
             with gr.Column():
                 with gr.Row():

From 564a3e15532f9b71ebc85f160d02312b08b66638 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 23 Jun 2024 22:31:07 -0700
Subject: [PATCH 47/64] Remove the awkward "Tab" keyboard shortcut

---
 js/main.js | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/js/main.js b/js/main.js
index 72568e1b..75c1e5eb 100644
--- a/js/main.js
+++ b/js/main.js
@@ -98,20 +98,6 @@ document.addEventListener("keydown", function(event) {
     document.getElementById("Impersonate").click();
   }
 
-  // Switch between tabs on Tab
-  else if (!event.ctrlKey && !event.shiftKey && !event.altKey && !event.metaKey && event.key === "Tab") {
-    event.preventDefault();
-    var parametersButton = document.getElementById("parameters-button");
-    var parentContainer = parametersButton.parentNode;
-    var selectedChild = parentContainer.querySelector(".selected");
-
-    if (selectedChild.id == "parameters-button") {
-      document.getElementById(previousTabId).click();
-    } else {
-      previousTabId = selectedChild.id;
-      parametersButton.click();
-    }
-  }
 });
 
 //------------------------------------------------

From 35f32d08bc4662b33c59702cae00ea04c2ccb40b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 23 Jun 2024 22:34:18 -0700
Subject: [PATCH 48/64] GitHub: Increase the stalebot time to 6 months

---
 .github/workflows/stale.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index fee54196..8eb03299 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,8 +13,8 @@ jobs:
       - uses: actions/stale@v5
         with:
           stale-issue-message: ""
-          close-issue-message: "This issue has been closed due to inactivity for 2 months. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment."
-          days-before-issue-stale: 60
+          close-issue-message: "This issue has been closed due to inactivity for 6 months. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment."
+          days-before-issue-stale: 180
           days-before-issue-close: 0
           stale-issue-label: "stale"
           days-before-pr-stale: -1

From 7db8b3b5326637b330676696078bf07d1889220e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 24 Jun 2024 05:38:11 -0700
Subject: [PATCH 49/64] Bump ExLlamaV2 to 0.1.6

---
 requirements.txt               | 10 +++++-----
 requirements_amd.txt           |  6 +++---
 requirements_amd_noavx2.txt    |  6 +++---
 requirements_apple_intel.txt   |  2 +-
 requirements_apple_silicon.txt |  2 +-
 requirements_noavx2.txt        | 10 +++++-----
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 01510dea..fb8ce150 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,11 +53,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 8e7b24cd..ec985865 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -40,8 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 65122f09..c7ca2635 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 7fc9202b..807c4037 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -38,4 +38,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 2f45cff5..4205df23 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -40,4 +40,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 4ddffe67..4b3da7df 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -53,11 +53,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

From 96ba53d916d24182993d75d6d20169c68096fdab Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 24 Jun 2024 15:50:45 -0700
Subject: [PATCH 50/64] Handle another fix after
 57119c1b300d8f7b1cc87c6aea9b1bf5a294d87f

---
 modules/block_requests.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/modules/block_requests.py b/modules/block_requests.py
index 1bfa9a99..778b9f5a 100644
--- a/modules/block_requests.py
+++ b/modules/block_requests.py
@@ -43,6 +43,9 @@ def my_open(*args, **kwargs):
         with original_open(*args, **kwargs) as f:
             file_contents = f.read()
 
+        if len(args) > 1 and args[1] == 'rb':
+            file_contents = file_contents.decode('utf-8')
+
         file_contents = file_contents.replace('\t\t<script\n\t\t\tsrc="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.9/iframeResizer.contentWindow.min.js"\n\t\t\tasync\n\t\t></script>', '')
         file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1')
         file_contents = file_contents.replace(
@@ -55,7 +58,12 @@ def my_open(*args, **kwargs):
             '\n  </head>'
         )
 
-        return io.StringIO(file_contents)
+        if len(args) > 1 and args[1] == 'rb':
+            file_contents = file_contents.encode('utf-8')
+            return io.BytesIO(file_contents)
+        else:
+            return io.StringIO(file_contents)
+
     else:
         return original_open(*args, **kwargs)
 

From a43c2106179b06f1ecff66136439f8d656bcdcfa Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 00:07:22 -0300
Subject: [PATCH 51/64] Improved past chats menu (#6158)

---
 css/main.css       | 51 +++++++++++++++++++++++++++++++++++++++++++++-
 js/main.js         |  5 +++++
 modules/ui_chat.py | 12 ++++++-----
 3 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/css/main.css b/css/main.css
index 0c149d2c..498b3c6c 100644
--- a/css/main.css
+++ b/css/main.css
@@ -221,6 +221,7 @@ button {
 
 .pretty_scrollbar::-webkit-scrollbar {
     width: 7px;
+    height: 7px;
 }
 
 .pretty_scrollbar::-webkit-scrollbar-track {
@@ -245,6 +246,10 @@ button {
     background: #374151;
 }
 
+.pretty_scrollbar::-webkit-scrollbar-corner {
+    background: transparent;
+}
+
 audio {
     max-width: 100%;
 }
@@ -695,7 +700,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 @media screen and (width >= 1327px) {
     #past-chats-row {
         position: absolute;
-        top: 16px;
+        top: 36px;
         left: 0;
         width: calc(0.5*(var(--document-width) - 880px - 120px - 16px*2));
         max-width: 300px;
@@ -743,3 +748,47 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
         display: none;
     }
 }
+
+#past-chats {
+    max-height: calc(100vh - 195px);
+    overflow-y: scroll !important;
+    border-radius: 0;
+    scrollbar-width: none; /* Hide scrollbar in Firefox by default */
+}
+
+#past-chats label {
+    width: 100%;
+    background-color: transparent !important;
+    background: none;
+    border: 0;
+    border-radius: 0;
+    padding-top: 8px;
+    padding-bottom: 8px;
+}
+
+#past-chats > :nth-child(2) {
+    display: none;
+}
+
+#past-chats > :nth-child(3) {
+    gap: 0;
+}
+
+#past-chats::-webkit-scrollbar {
+    display: none;
+}
+
+#past-chats:hover {
+    scrollbar-width: auto;
+}
+
+#past-chats:hover::-webkit-scrollbar {
+    display: block;
+}
+
+@media screen and (width < 1327px) {
+    #past-chats {
+        max-height: 300px;
+    }
+}
+
diff --git a/js/main.js b/js/main.js
index 75c1e5eb..e9a980e2 100644
--- a/js/main.js
+++ b/js/main.js
@@ -534,3 +534,8 @@ document.querySelectorAll(".focus-on-chat-input").forEach(element => {
     document.querySelector("#chat-input textarea").focus();
   });
 });
+
+//------------------------------------------------
+// Fix a border around the "past chats" menu
+//------------------------------------------------
+document.getElementById("past-chats").parentNode.style.borderRadius = "0px";
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 203e55f9..d9bf4cb5 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -62,9 +62,6 @@ def create_ui():
 
         with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
             with gr.Column():
-                with gr.Row():
-                    shared.gradio['unique_id'] = gr.Dropdown(label='Past chats', elem_classes=['slim-dropdown'], interactive=not mu)
-
                 with gr.Row():
                     shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
                     shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
@@ -74,8 +71,13 @@ def create_ui():
 
                 with gr.Row(elem_id='rename-row'):
                     shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background'])
-                    shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
-                    shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+                    with gr.Row():
+                        shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+                        shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+
+                gr.Markdown("Past chats")
+                with gr.Row():
+                    shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats')
 
         with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']):
             with gr.Column():

From 602b4555074c8d5740c9d154b30c72dec2e2d6fe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 24 Jun 2024 20:26:38 -0700
Subject: [PATCH 52/64] Bump llama-cpp-python to 0.2.79

---
 requirements.txt                 | 24 ++++++++++++------------
 requirements_amd.txt             | 12 ++++++------
 requirements_amd_noavx2.txt      |  8 ++++----
 requirements_apple_intel.txt     | 12 ++++++------
 requirements_apple_silicon.txt   | 16 ++++++++--------
 requirements_cpu_only.txt        |  8 ++++----
 requirements_cpu_only_noavx2.txt |  8 ++++----
 requirements_noavx2.txt          | 24 ++++++++++++------------
 8 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index fb8ce150..fb35c7d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index ec985865..5bb68522 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -32,14 +32,14 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.79+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.79+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index c7ca2635..eee2c662 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -32,10 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 807c4037..61e3c47c 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -32,10 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 4205df23..b9497470 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -32,12 +32,12 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index b0634fe5..ad38f23d 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index a40a68f0..0debda88 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 4b3da7df..78342a2e 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

From e7e1f5901e5f6e1fe98f3947611042beb130296a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 15:01:43 -0300
Subject: [PATCH 53/64] Prompts in the "past chats" menu (#6160)

---
 css/main.css       |  7 -----
 js/main.js         | 28 -----------------
 modules/chat.py    | 75 +++++++++++++++++++++++++++++-----------------
 modules/ui_chat.py | 32 +++-----------------
 4 files changed, 52 insertions(+), 90 deletions(-)

diff --git a/css/main.css b/css/main.css
index 498b3c6c..be73a7cf 100644
--- a/css/main.css
+++ b/css/main.css
@@ -687,13 +687,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
     margin-top: 6px !important;
 }
 
-/* ----------------------------------------------
-  Past chats menus
----------------------------------------------- */
-#rename-row label {
-    margin-top: var(--layout-gap);
-}
-
 /* ----------------------------------------------
   Past chat histories in a side bar on desktop
 ---------------------------------------------- */
diff --git a/js/main.js b/js/main.js
index e9a980e2..f8610095 100644
--- a/js/main.js
+++ b/js/main.js
@@ -488,34 +488,6 @@ function updateDocumentWidth() {
 updateDocumentWidth();
 window.addEventListener("resize", updateDocumentWidth);
 
-//------------------------------------------------
-// Focus on the rename text area when it becomes visible
-//------------------------------------------------
-const renameTextArea = document.getElementById("rename-row").querySelector("textarea");
-
-function respondToRenameVisibility(element, callback) {
-  var options = {
-    root: document.documentElement,
-  };
-
-  var observer = new IntersectionObserver((entries, observer) => {
-    entries.forEach(entry => {
-      callback(entry.intersectionRatio > 0);
-    });
-  }, options);
-
-  observer.observe(element);
-}
-
-
-function handleVisibilityChange(isVisible) {
-  if (isVisible) {
-    renameTextArea.focus();
-  }
-}
-
-respondToRenameVisibility(renameTextArea, handleVisibilityChange);
-
 //------------------------------------------------
 // Adjust the chat tab margin if no extension UI
 // is present at the bottom
diff --git a/modules/chat.py b/modules/chat.py
index 5da6ad3b..baa926fd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -495,27 +495,9 @@ def save_history(history, unique_id, character, mode):
         f.write(json.dumps(history, indent=4, ensure_ascii=False))
 
 
-def rename_history(old_id, new_id, character, mode):
-    if shared.args.multi_user:
-        return
-
-    old_p = get_history_file_path(old_id, character, mode)
-    new_p = get_history_file_path(new_id, character, mode)
-    if new_p.parent != old_p.parent:
-        logger.error(f"The following path is not allowed: \"{new_p}\".")
-    elif new_p == old_p:
-        logger.info("The provided path is identical to the old one.")
-    else:
-        logger.info(f"Renaming \"{old_p}\" to \"{new_p}\"")
-        old_p.rename(new_p)
-
-
-def find_all_histories(state):
-    if shared.args.multi_user:
-        return ['']
-
+def get_paths(state):
     if state['mode'] == 'instruct':
-        paths = Path('logs/instruct').glob('*.json')
+        return Path('logs/instruct').glob('*.json')
     else:
         character = state['character_menu']
 
@@ -533,12 +515,51 @@ def find_all_histories(state):
             p.parent.mkdir(exist_ok=True)
             new_p.rename(p)
 
-        paths = Path(f'logs/chat/{character}').glob('*.json')
+        return Path(f'logs/chat/{character}').glob('*.json')
 
+
+def find_all_histories(state):
+    if shared.args.multi_user:
+        return ['']
+
+    paths = get_paths(state)
     histories = sorted(paths, key=lambda x: x.stat().st_mtime, reverse=True)
-    histories = [path.stem for path in histories]
+    return [path.stem for path in histories]
 
-    return histories
+
+def find_all_histories_with_first_prompts(state):
+    if shared.args.multi_user:
+        return []
+
+    paths = get_paths(state)
+    histories = sorted(paths, key=lambda x: x.stat().st_mtime, reverse=True)
+
+    result = []
+    for i, path in enumerate(histories):
+        filename = path.stem
+        with open(path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            if 'visible' in data and len(data['visible']) > 0:
+                if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
+                    first_prompt = data['visible'][1][0] if len(data['visible']) > 1 else ''
+                else:
+                    first_prompt = data['visible'][0][0]
+
+                first_prompt = html.unescape(first_prompt)
+            elif i == 0:
+                first_prompt = "New chat"
+            else:
+                first_prompt = ''
+
+            first_prompt = first_prompt.strip()
+
+            # Truncate the first prompt if it's longer than 32 characters
+            if len(first_prompt) > 32:
+                first_prompt = first_prompt[:29] + '...'
+
+            result.append((first_prompt, filename))
+
+    return result
 
 
 def load_latest_history(state):
@@ -569,17 +590,17 @@ def load_history_after_deletion(state, idx):
     if shared.args.multi_user:
         return start_new_chat(state)
 
-    histories = find_all_histories(state)
+    histories = find_all_histories_with_first_prompts(state)
     idx = min(int(idx), len(histories) - 1)
     idx = max(0, idx)
 
     if len(histories) > 0:
-        history = load_history(histories[idx], state['character_menu'], state['mode'])
+        history = load_history(histories[idx][1], state['character_menu'], state['mode'])
     else:
         history = start_new_chat(state)
-        histories = find_all_histories(state)
+        histories = find_all_histories_with_first_prompts
 
-    return history, gr.update(choices=histories, value=histories[idx])
+    return history, gr.update(choices=histories, value=histories[idx][1])
 
 
 def update_character_menu_after_deletion(idx):
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index d9bf4cb5..b6ca97e2 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -63,18 +63,11 @@ def create_ui():
         with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
             with gr.Column():
                 with gr.Row():
-                    shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
                     shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
                     shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
                     shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
                     shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input'])
 
-                with gr.Row(elem_id='rename-row'):
-                    shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background'])
-                    with gr.Row():
-                        shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
-                        shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
-
                 gr.Markdown("Past chats")
                 with gr.Row():
                     shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats')
@@ -254,7 +247,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id'))
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'))
 
     shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr))
     shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
@@ -266,29 +259,12 @@ def create_event_handlers():
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
         lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
 
-    shared.gradio['rename_chat'].click(
-        lambda x: x, gradio('unique_id'), gradio('rename_to')).then(
-        lambda: [gr.update(visible=True)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False)
-
-    shared.gradio['rename_to-cancel'].click(
-        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False)
-
-    shared.gradio['rename_to-confirm'].click(
-        chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
-        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
-        lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
-
-    shared.gradio['rename_to'].submit(
-        chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
-        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
-        lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
-
     shared.gradio['load_chat_history'].upload(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
         chat.load_history_json, gradio('load_chat_history', 'history'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
         None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}')
 
@@ -297,7 +273,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id')).then(
         None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
 
     shared.gradio['mode'].change(
@@ -305,7 +281,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id'))
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'))
 
     shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'))
     shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False)

From 3290edfad91e7e255ed5eb74a276884c17b0a16e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:06:05 -0700
Subject: [PATCH 54/64] Bug fix: force chat history to be loaded on launch

---
 modules/ui_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index b6ca97e2..041ba1ec 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -78,7 +78,7 @@ def create_ui():
                     shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
 
                 with gr.Row():
-                    shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value='chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode')
+                    shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode')
 
                 with gr.Row():
                     shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')

From 279cba607fd28b81af1c5185f170f470800264bb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:10:17 -0700
Subject: [PATCH 55/64] UI: don't show an animation when updating the "past
 chats" menu

---
 modules/ui_chat.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 041ba1ec..91f7e680 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -247,7 +247,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'))
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False)
 
     shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr))
     shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
@@ -255,7 +255,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x, y: str(chat.find_all_histories(x).index(y)), gradio('interface_state', 'unique_id'), gradio('temporary_text')).then(
         chat.delete_history, gradio('unique_id', 'character_menu', 'mode'), None).then(
-        chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id')).then(
+        chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id'), show_progress=False).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
         lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
 
@@ -264,7 +264,7 @@ def create_event_handlers():
         chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
         chat.load_history_json, gradio('load_chat_history', 'history'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
         None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}')
 
@@ -273,7 +273,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False).then(
         None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
 
     shared.gradio['mode'].change(
@@ -281,7 +281,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'))
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False)
 
     shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'))
     shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False)

From 83534798b226ec64e7e455d1f2449a33d7ef6da1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:25:57 -0700
Subject: [PATCH 56/64] UI: move "Character" dropdown to the main Chat tab

---
 modules/ui_chat.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 91f7e680..a3f4ea96 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -80,6 +80,11 @@ def create_ui():
                 with gr.Row():
                     shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode')
 
+                with gr.Row():
+                    shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', elem_classes='slim-dropdown')
+                    shared.gradio['refresh_character'] = ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
+                    shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
+
                 with gr.Row():
                     shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
 
@@ -93,15 +98,10 @@ def create_chat_settings_ui():
         with gr.Row():
             with gr.Column(scale=8):
                 with gr.Tab("Character"):
-                    with gr.Row():
-                        shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown')
-                        ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
-                        shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
-                        shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
-
                     shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name')
                     shared.gradio['context'] = gr.Textbox(value='', lines=10, label='Context', elem_classes=['add_scrollbar'])
                     shared.gradio['greeting'] = gr.Textbox(value='', lines=5, label='Greeting', elem_classes=['add_scrollbar'])
+                    shared.gradio['save_character'] = gr.Button('Save character', elem_classes=['small-button'], interactive=not mu)
 
                 with gr.Tab("User"):
                     shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Name')
@@ -277,7 +277,7 @@ def create_event_handlers():
         None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
 
     shared.gradio['mode'].change(
-        lambda x: [gr.update(visible=x != 'instruct'), gr.update(visible=x == 'chat-instruct')], gradio('mode'), gradio('chat_style', 'chat-instruct_command'), show_progress=False).then(
+        lambda x: [gr.update(visible=(x != 'instruct'))] * 4 + [gr.update(visible=(x == 'chat-instruct'))], gradio('mode'), gradio('character_menu', 'refresh_character', 'delete_character', 'chat_style', 'chat-instruct_command'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(

From 46ca15cb793dc8879b255ada56525d709c85210c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:49:33 -0700
Subject: [PATCH 57/64] Minor bug fixes after
 e7e1f5901e5f6e1fe98f3947611042beb130296a

---
 modules/chat.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index baa926fd..31c1c012 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -539,17 +539,18 @@ def find_all_histories_with_first_prompts(state):
         filename = path.stem
         with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
+
+            first_prompt = ""
             if 'visible' in data and len(data['visible']) > 0:
                 if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
-                    first_prompt = data['visible'][1][0] if len(data['visible']) > 1 else ''
+                    if len(data['visible']) > 1:
+                        first_prompt = html.unescape(data['visible'][1][0])
+                    elif i == 0:
+                        first_prompt = "New chat"
                 else:
-                    first_prompt = data['visible'][0][0]
-
-                first_prompt = html.unescape(first_prompt)
+                    first_prompt = html.unescape(data['visible'][0][0])
             elif i == 0:
                 first_prompt = "New chat"
-            else:
-                first_prompt = ''
 
             first_prompt = first_prompt.strip()
 
@@ -598,7 +599,7 @@ def load_history_after_deletion(state, idx):
         history = load_history(histories[idx][1], state['character_menu'], state['mode'])
     else:
         history = start_new_chat(state)
-        histories = find_all_histories_with_first_prompts
+        histories = find_all_histories_with_first_prompts(state)
 
     return history, gr.update(choices=histories, value=histories[idx][1])
 

From ac3f92d36a0a32cc3feda0c7d276092df0b1cc05 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 14:18:07 -0700
Subject: [PATCH 58/64] UI: store chat history in the browser

---
 modules/ui_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index a3f4ea96..ab8d720a 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -19,7 +19,7 @@ def create_ui():
     mu = shared.args.multi_user
 
     shared.gradio['Chat input'] = gr.State()
-    shared.gradio['history'] = gr.State({'internal': [], 'visible': []})
+    shared.gradio['history'] = gr.JSON({'internal': [], 'visible': []}, visible=False)
 
     with gr.Tab('Chat', elem_id='chat-tab', elem_classes=("old-ui" if shared.args.chat_buttons else None)):
         with gr.Row():

From 53fbd2f2457a6194048481e3a116a3114b05ac7d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 14:45:37 -0700
Subject: [PATCH 59/64] Add TensorRT-LLM to the README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 52605f5e..faeeb1e9 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Features
 
 * 3 interface modes: default (two columns), notebook, and chat.
-* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
+* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
 * Dropdown menu for quickly switching between different models.
 * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character).

From 3691451d00f83da55f494f83586a39c68517afa9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 22:28:58 -0300
Subject: [PATCH 60/64] Add back the "Rename chat" feature (#6161)

---
 css/main.css       |  7 ++++++
 js/main.js         | 28 +++++++++++++++++++++++
 modules/chat.py    | 56 +++++++++++++++++++++++++++++++---------------
 modules/ui_chat.py | 24 ++++++++++++++++++++
 4 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/css/main.css b/css/main.css
index be73a7cf..498b3c6c 100644
--- a/css/main.css
+++ b/css/main.css
@@ -687,6 +687,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
     margin-top: 6px !important;
 }
 
+/* ----------------------------------------------
+  Past chats menus
+---------------------------------------------- */
+#rename-row label {
+    margin-top: var(--layout-gap);
+}
+
 /* ----------------------------------------------
   Past chat histories in a side bar on desktop
 ---------------------------------------------- */
diff --git a/js/main.js b/js/main.js
index f8610095..e9a980e2 100644
--- a/js/main.js
+++ b/js/main.js
@@ -488,6 +488,34 @@ function updateDocumentWidth() {
 updateDocumentWidth();
 window.addEventListener("resize", updateDocumentWidth);
 
+//------------------------------------------------
+// Focus on the rename text area when it becomes visible
+//------------------------------------------------
+const renameTextArea = document.getElementById("rename-row").querySelector("textarea");
+
+function respondToRenameVisibility(element, callback) {
+  var options = {
+    root: document.documentElement,
+  };
+
+  var observer = new IntersectionObserver((entries, observer) => {
+    entries.forEach(entry => {
+      callback(entry.intersectionRatio > 0);
+    });
+  }, options);
+
+  observer.observe(element);
+}
+
+
+function handleVisibilityChange(isVisible) {
+  if (isVisible) {
+    renameTextArea.focus();
+  }
+}
+
+respondToRenameVisibility(renameTextArea, handleVisibilityChange);
+
 //------------------------------------------------
 // Adjust the chat tab margin if no extension UI
 // is present at the bottom
diff --git a/modules/chat.py b/modules/chat.py
index 31c1c012..5d2bdd63 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -495,6 +495,23 @@ def save_history(history, unique_id, character, mode):
         f.write(json.dumps(history, indent=4, ensure_ascii=False))
 
 
+def rename_history(old_id, new_id, character, mode):
+    if shared.args.multi_user:
+        return
+
+    old_p = get_history_file_path(old_id, character, mode)
+    new_p = get_history_file_path(new_id, character, mode)
+    if new_p.parent != old_p.parent:
+        logger.error(f"The following path is not allowed: \"{new_p}\".")
+    elif new_p == old_p:
+        logger.info("The provided path is identical to the old one.")
+    elif new_p.exists():
+        logger.error(f"The new path already exists and will not be overwritten: \"{new_p}\".")
+    else:
+        logger.info(f"Renaming \"{old_p}\" to \"{new_p}\"")
+        old_p.rename(new_p)
+
+
 def get_paths(state):
     if state['mode'] == 'instruct':
         return Path('logs/instruct').glob('*.json')
@@ -537,28 +554,31 @@ def find_all_histories_with_first_prompts(state):
     result = []
     for i, path in enumerate(histories):
         filename = path.stem
-        with open(path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
+        if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename):
+            with open(path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
 
-            first_prompt = ""
-            if 'visible' in data and len(data['visible']) > 0:
-                if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
-                    if len(data['visible']) > 1:
-                        first_prompt = html.unescape(data['visible'][1][0])
-                    elif i == 0:
-                        first_prompt = "New chat"
-                else:
-                    first_prompt = html.unescape(data['visible'][0][0])
-            elif i == 0:
-                first_prompt = "New chat"
+                first_prompt = ""
+                if 'visible' in data and len(data['visible']) > 0:
+                    if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
+                        if len(data['visible']) > 1:
+                            first_prompt = html.unescape(data['visible'][1][0])
+                        elif i == 0:
+                            first_prompt = "New chat"
+                    else:
+                        first_prompt = html.unescape(data['visible'][0][0])
+                elif i == 0:
+                    first_prompt = "New chat"
+        else:
+            first_prompt = filename
 
-            first_prompt = first_prompt.strip()
+        first_prompt = first_prompt.strip()
 
-            # Truncate the first prompt if it's longer than 32 characters
-            if len(first_prompt) > 32:
-                first_prompt = first_prompt[:29] + '...'
+        # Truncate the first prompt if it's longer than 32 characters
+        if len(first_prompt) > 32:
+            first_prompt = first_prompt[:29] + '...'
 
-            result.append((first_prompt, filename))
+        result.append((first_prompt, filename))
 
     return result
 
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index ab8d720a..91951624 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -63,11 +63,18 @@ def create_ui():
         with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
             with gr.Column():
                 with gr.Row():
+                    shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
                     shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
                     shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
                     shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
                     shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input'])
 
+                with gr.Row(elem_id='rename-row'):
+                    shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background'])
+                    with gr.Row():
+                        shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+                        shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+
                 gr.Markdown("Past chats")
                 with gr.Row():
                     shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats')
@@ -259,6 +266,23 @@ def create_event_handlers():
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
         lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
 
+    shared.gradio['rename_chat'].click(
+        lambda: "My New Chat", None, gradio('rename_to')).then(
+        lambda: [gr.update(visible=True)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False)
+
+    shared.gradio['rename_to-cancel'].click(
+        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False)
+
+    shared.gradio['rename_to-confirm'].click(
+        chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
+        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
+        lambda x, y: gr.update(choices=chat.find_all_histories_with_first_prompts(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
+
+    shared.gradio['rename_to'].submit(
+        chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
+        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
+        lambda x, y: gr.update(choices=chat.find_all_histories_with_first_prompts(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
+
     shared.gradio['load_chat_history'].upload(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.start_new_chat, gradio('interface_state'), gradio('history')).then(

From b1187fc9a569946bb6292b4c22769ea81bf1ffcd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 25 Jun 2024 19:19:45 -0700
Subject: [PATCH 61/64] UI: prevent flickering while streaming lists / bullet
 points

---
 modules/html_generator.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 7e50f561..3966f2a1 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -85,15 +85,20 @@ def convert_to_markdown(string):
 
     # Unfinished list, like "\n1.". A |delete| string is added and then
     # removed to force a <ol> or <ul> to be generated instead of a <p>.
-    if re.search(r'(\n\d+\.?|\n\*\s*)$', result):
+    list_item_pattern = r'(\n\d+\.?|\n\*\s*|\n\s*[-*+]\s*)$'
+    if re.search(list_item_pattern, result):
         delete_str = '|delete|'
 
         if re.search(r'(\d+\.?)$', result) and not result.endswith('.'):
             result += '.'
 
-        result = re.sub(r'(\n\d+\.?|\n\*\s*)$', r'\g<1> ' + delete_str, result)
+        # Add the delete string after the list item
+        result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result)
 
+        # Convert to HTML using markdown
         html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
+
+        # Remove the delete string from the HTML output
         pos = html_output.rfind(delete_str)
         if pos > -1:
             html_output = html_output[:pos] + html_output[pos + len(delete_str):]

From 5fe532a5ce3852398be4de21093fb641b440ff8f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 26 Jun 2024 15:33:11 -0700
Subject: [PATCH 62/64] UI: remove DRY info text

It was visible for loaders without DRY.
---
 modules/ui_parameters.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index d62b74c1..68512c7e 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -40,7 +40,6 @@ def create_ui(default_preset):
                             shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
 
                             with gr.Blocks():
-                                gr.Markdown("[DRY sequence repetition penalty](https://github.com/oobabooga/text-generation-webui/pull/5677)")
                                 shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=generate_params['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to value > 0 to enable DRY. Controls the magnitude of the penalty for the shortest penalized sequences.')
                                 shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.')
                                 shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=generate_params['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.')

From a85749dcbef8c6bad119e31f8e10d665b1788835 Mon Sep 17 00:00:00 2001
From: mefich <github.z.kernelpanic@neverbox.com>
Date: Thu, 27 Jun 2024 06:17:56 +0500
Subject: [PATCH 63/64] Update models_settings.py: add default alpha_value, add
 proper compress_pos_emb for newer GGUFs (#6111)

---
 modules/models_settings.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index 387c5658..2e3fff9c 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -16,6 +16,7 @@ def get_fallback_settings():
         'n_ctx': 2048,
         'rope_freq_base': 0,
         'compress_pos_emb': 1,
+        'alpha_value': 1,
         'truncation_length': shared.settings['truncation_length'],
         'skip_special_tokens': shared.settings['skip_special_tokens'],
         'custom_stopping_strings': shared.settings['custom_stopping_strings'],
@@ -58,6 +59,8 @@ def get_model_metadata(model):
                 model_settings['rope_freq_base'] = metadata[k]
             elif k.endswith('rope.scale_linear'):
                 model_settings['compress_pos_emb'] = metadata[k]
+            elif k.endswith('rope.scaling.factor'):
+                model_settings['compress_pos_emb'] = metadata[k]
             elif k.endswith('block_count'):
                 model_settings['n_gpu_layers'] = metadata[k] + 1
 

From 8ec8bc0b85a27c82bdcdc3b24bc5d61b2fc903a7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 26 Jun 2024 18:40:43 -0700
Subject: [PATCH 64/64] UI: handle another edge case while streaming lists

---
 modules/html_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 3966f2a1..657133bd 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -85,7 +85,7 @@ def convert_to_markdown(string):
 
     # Unfinished list, like "\n1.". A |delete| string is added and then
     # removed to force a <ol> or <ul> to be generated instead of a <p>.
-    list_item_pattern = r'(\n\d+\.?|\n\*\s*|\n\s*[-*+]\s*)$'
+    list_item_pattern = r'(\n\d+\.?|\n\s*[-*+]\s*([*_~]{1,3})?)$'
     if re.search(list_item_pattern, result):
         delete_str = '|delete|'