diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 91abb11f..93aaf445 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -7,5 +7,6 @@ version: 2
 updates:
   - package-ecosystem: "pip" # See documentation for possible values
     directory: "/" # Location of package manifests
+    target-branch: "dev"
     schedule:
       interval: "weekly"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index fee54196..8eb03299 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,8 +13,8 @@ jobs:
       - uses: actions/stale@v5
         with:
           stale-issue-message: ""
-          close-issue-message: "This issue has been closed due to inactivity for 2 months. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment."
-          days-before-issue-stale: 60
+          close-issue-message: "This issue has been closed due to inactivity for 6 months. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment."
+          days-before-issue-stale: 180
           days-before-issue-close: 0
           stale-issue-label: "stale"
           days-before-pr-stale: -1
diff --git a/README.md b/README.md
index a699a9ab..1b8de70a 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Features
 
 * 3 interface modes: default (two columns), notebook, and chat.
-* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
+* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
 * Dropdown menu for quickly switching between different models.
 * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character).
@@ -76,12 +76,12 @@ conda activate textgen
 
 | System | GPU | Command |
 |--------|---------|---------|
-| Linux/WSL | NVIDIA | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121` |
-| Linux/WSL | CPU only | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/rocm5.6` |
-| MacOS + MPS | Any | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1` |
-| Windows | NVIDIA | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121` |
-| Windows | CPU only | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1` |
+| Linux/WSL | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` |
+| Linux/WSL | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu` |
+| Linux | AMD | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.6` |
+| MacOS + MPS | Any | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` |
+| Windows | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` |
+| Windows | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` |
 
 The up-to-date commands can be found here: https://pytorch.org/get-started/locally/.
 
@@ -146,7 +146,7 @@ Then browse to
 1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12:
 
 ```
-pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118
 conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime
 ```
 
@@ -392,15 +392,18 @@ Run `python download-model.py --help` to see all the options.
 
 https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
 
-## Contributing
+## Acknowledgment
 
-If you would like to contribute to the project, check out the [Contributing guidelines](https://github.com/oobabooga/text-generation-webui/wiki/Contributing-guidelines).
+In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
 
-## Community
+## Links
+
+#### Community
 
 * Subreddit: https://www.reddit.com/r/oobabooga/
 * Discord: https://discord.gg/jwZCF2dPQN
 
-## Acknowledgment
+#### Support
 
-In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
+* ko-fi: https://ko-fi.com/oobabooga
+* GitHub Sponsors: https://github.com/sponsors/oobabooga
diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 8891d180..8a31d6e2 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -49,7 +49,7 @@
 
 .gradio-container .chat .assistant-message {
     padding: 20px;
-    background: var(--color-grey-200);
+    background: #f4f4f4;
     margin-top: 9px !important;
     margin-bottom: 12px !important;
     border-radius: 7px;
@@ -62,8 +62,8 @@
 
 .gradio-container .chat .user-message {
     padding: 20px;
-    padding-left: 0px;
-    padding-right: 0px;
+    padding-left: 0;
+    padding-right: 0;
     background-color: transparent;
     border-radius: 8px;
     border-bottom-right-radius: 0;
diff --git a/css/main.css b/css/main.css
index a1ff2a36..5768348e 100644
--- a/css/main.css
+++ b/css/main.css
@@ -95,8 +95,8 @@ gradio-app > :first-child {
 }
 
 .header_bar {
-    background-color: #f7f7f7;
-    box-shadow: 0 0px 3px rgba(22 22 22 / 35%);
+    background-color: #f4f4f4;
+    box-shadow: 0 0 3px rgba(22 22 22 / 35%);
     margin-bottom: 0;
     overflow-x: scroll;
     margin-left: calc(-1 * var(--size-4));
@@ -221,6 +221,7 @@ button {
 
 .pretty_scrollbar::-webkit-scrollbar {
     width: 7px;
+    height: 7px;
 }
 
 .pretty_scrollbar::-webkit-scrollbar-track {
@@ -245,6 +246,10 @@ button {
     background: #374151;
 }
 
+.pretty_scrollbar::-webkit-scrollbar-corner {
+    background: transparent;
+}
+
 audio {
     max-width: 100%;
 }
@@ -331,6 +336,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
         padding-left: 0;
         padding-right: 0;
     }
+
+    .chat {
+        padding-left: 0;
+        padding-right: 0;
+    }
 }
 
 .chat {
@@ -386,7 +396,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 
 .chat .message:last-child {
     margin-bottom: 0 !important;
-    padding-bottom: 0 !important;
+    padding-bottom: 15px !important;
 }
 
 .message-body li {
@@ -433,12 +443,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 .message-body code {
     white-space: pre-wrap !important;
     word-wrap: break-word !important;
-    border: 1px solid #666666;
+    border: 1px solid #666;
     border-radius: 5px;
     font-size: 82%;
     padding: 1px 3px;
     background: #0d1117 !important;
-    color: rgb(201, 209, 217);
+    color: rgb(201 209 217);
 }
 
 .message-body pre > code {
@@ -505,7 +515,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 #show-controls {
     position: absolute;
     height: 100%;
-    background-color: var(--background-fill-primary);
+    background-color: transparent;
     border: 0 !important;
     border-radius: 0;
 }
@@ -695,7 +705,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 @media screen and (width >= 1327px) {
     #past-chats-row {
         position: absolute;
-        top: 16px;
+        top: 36px;
         left: 0;
         width: calc(0.5*(var(--document-width) - 880px - 120px - 16px*2));
         max-width: 300px;
@@ -743,3 +753,47 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
         display: none;
     }
 }
+
+#past-chats {
+    max-height: calc(100vh - 195px);
+    overflow-y: scroll !important;
+    border-radius: 0;
+    scrollbar-width: none; /* Hide scrollbar in Firefox by default */
+}
+
+#past-chats label {
+    width: 100%;
+    background-color: transparent !important;
+    background: none;
+    border: 0;
+    border-radius: 0;
+    padding-top: 8px;
+    padding-bottom: 8px;
+}
+
+#past-chats > :nth-child(2) {
+    display: none;
+}
+
+#past-chats > :nth-child(3) {
+    gap: 0;
+}
+
+#past-chats::-webkit-scrollbar {
+    display: none;
+}
+
+#past-chats:hover {
+    scrollbar-width: auto;
+}
+
+#past-chats:hover::-webkit-scrollbar {
+    display: block;
+}
+
+@media screen and (width < 1327px) {
+    #past-chats {
+        max-height: 300px;
+    }
+}
+
diff --git a/docker/TensorRT-LLM/Dockerfile b/docker/TensorRT-LLM/Dockerfile
new file mode 100644
index 00000000..ae503c94
--- /dev/null
+++ b/docker/TensorRT-LLM/Dockerfile
@@ -0,0 +1,27 @@
+FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-runtime
+
+# Install Git
+RUN apt update && apt install -y git
+
+# System-wide TensorRT-LLM requirements
+RUN apt install -y openmpi-bin libopenmpi-dev
+
+# Set the working directory
+WORKDIR /app
+
+# Install text-generation-webui
+RUN git clone https://github.com/oobabooga/text-generation-webui
+WORKDIR /app/text-generation-webui
+RUN pip install -r requirements.txt
+
+# This is needed to avoid an error about "Failed to build mpi4py" in the next command
+ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+
+# Install TensorRT-LLM
+RUN pip3 install tensorrt_llm==0.10.0 -U --pre --extra-index-url https://pypi.nvidia.com
+
+# Expose the necessary port for the Python server
+EXPOSE 7860 5000
+
+# Run the Python server.py script with the specified command
+CMD ["python", "server.py", "--api", "--listen"]
diff --git a/docs/02 - Default and Notebook Tabs.md b/docs/02 - Default and Notebook Tabs.md
index c450635e..4bb78448 100644
--- a/docs/02 - Default and Notebook Tabs.md	
+++ b/docs/02 - Default and Notebook Tabs.md	
@@ -18,13 +18,13 @@ In the **Prompt** menu, you can select from some predefined prompts defined unde
 
 ### Output
 
-Four tabs can be found:
+Five tabs can be found:
 
 * **Raw**: where the raw text generated by the model appears.
 * **Markdown**: it contains a "Render" button. You can click on it at any time to render the current output as markdown. This is particularly useful for models that generate LaTeX equations like GALACTICA.
 * **HTML**: displays the output in an HTML style that is meant to be easier to read. Its style is defined under `text-generation-webui/css/html_readable_style.css`.
 * **Logits**: when you click on "Get next token probabilities", this tab displays the 50 most likely next tokens and their probabilities based on your current input. If "Use samplers" is checked, the probabilities will be the ones after the sampling parameters in the "Parameters" > "Generation" tab are applied. Otherwise, they will be the raw probabilities generated by the model.
-* **Tokens**: allows you to tokenize your prompt and see the ID numbers for the individuals tokens.
+* **Tokens**: allows you to tokenize your prompt and see the ID numbers for the individual tokens.
 
 ## Notebook tab
 
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index fee57d33..b00a1f34 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -219,7 +219,7 @@ print()
 
 ### Environment variables
 
-The following environment variables can be used (they take precendence over everything else):
+The following environment variables can be used (they take precedence over everything else):
 
 | Variable Name          | Description                                                                                        | Example Value              |
 |------------------------|------------------------------------|----------------------------|
diff --git a/docs/README.md b/docs/README.md
index d2efbf1d..666ee85c 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,4 +1,4 @@
-These files is a mirror of the documentation at:
+These files are a mirror of the documentation at:
 
 # https://github.com/oobabooga/text-generation-webui/wiki
 
diff --git a/extensions/sd_api_pictures/script.py b/extensions/sd_api_pictures/script.py
index a16575cd..3a31771a 100644
--- a/extensions/sd_api_pictures/script.py
+++ b/extensions/sd_api_pictures/script.py
@@ -33,7 +33,7 @@ params = {
     'hr_upscaler': 'ESRGAN_4x',
     'hr_scale': '1.0',
     'seed': -1,
-    'sampler_name': 'DPM++ 2M Karras',
+    'sampler_name': 'DPM++ 2M',
     'steps': 32,
     'cfg_scale': 7,
     'textgen_prefix': 'Please provide a detailed and vivid description of [subject]',
diff --git a/extensions/whisper_stt/script.js b/extensions/whisper_stt/script.js
new file mode 100644
index 00000000..c4a908b5
--- /dev/null
+++ b/extensions/whisper_stt/script.js
@@ -0,0 +1,86 @@
+console.log("Whisper STT script loaded");
+
+let mediaRecorder;
+let audioChunks = [];
+let isRecording = false;
+
+window.startStopRecording = function() {
+  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
+    console.error("getUserMedia not supported on your browser!");
+    return;
+  }
+
+  if (isRecording == false) {
+    //console.log("Start recording function called");
+    navigator.mediaDevices.getUserMedia({ audio: true })
+      .then(stream => {
+        //console.log("Got audio stream");
+        mediaRecorder = new MediaRecorder(stream);
+        audioChunks = []; // Reset audio chunks
+        mediaRecorder.start();
+        //console.log("MediaRecorder started");
+        recButton.icon;
+        recordButton.innerHTML = recButton.innerHTML = "Stop";
+        isRecording = true;
+
+        mediaRecorder.addEventListener("dataavailable", event => {
+          //console.log("Data available event, data size: ", event.data.size);
+          audioChunks.push(event.data);
+        });
+                
+        mediaRecorder.addEventListener("stop", () => {
+          //console.log("MediaRecorder stopped");
+          if (audioChunks.length > 0) {
+            const audioBlob = new Blob(audioChunks, { type: "audio/webm" });
+            //console.log("Audio blob created, size: ", audioBlob.size);
+            const reader = new FileReader();
+            reader.readAsDataURL(audioBlob);
+            reader.onloadend = function() {
+              const base64data = reader.result;
+              //console.log("Audio converted to base64, length: ", base64data.length);
+                            
+              const audioBase64Input = document.querySelector("#audio-base64 textarea");
+              if (audioBase64Input) {
+                audioBase64Input.value = base64data;
+                audioBase64Input.dispatchEvent(new Event("input", { bubbles: true }));
+                audioBase64Input.dispatchEvent(new Event("change", { bubbles: true }));
+                //console.log("Updated textarea with base64 data");
+              } else {
+                console.error("Could not find audio-base64 textarea");
+              }
+            };
+          } else {
+            console.error("No audio data recorded for Whisper");
+          }
+        });
+      });
+  } else {
+    //console.log("Stopping MediaRecorder");
+    recordButton.innerHTML = recButton.innerHTML = "Rec.";
+    isRecording = false;
+    mediaRecorder.stop();
+  }
+};
+
+const recordButton = gradioApp().querySelector("#record-button");
+recordButton.addEventListener("click", window.startStopRecording);
+
+
+function gradioApp() {
+  const elems = document.getElementsByTagName("gradio-app");
+  const gradioShadowRoot = elems.length == 0 ? null : elems[0].shadowRoot;
+  return gradioShadowRoot ? gradioShadowRoot : document;
+}
+
+
+// extra rec button next to generate button
+var recButton = recordButton.cloneNode(true);
+var generate_button = document.getElementById("Generate");
+generate_button.insertAdjacentElement("afterend", recButton);
+
+recButton.style.setProperty("margin-left", "-10px");
+recButton.innerHTML = "Rec.";
+
+recButton.addEventListener("click", function() {
+  recordButton.click();
+});
diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py
index efc62f41..e45c8b1e 100644
--- a/extensions/whisper_stt/script.py
+++ b/extensions/whisper_stt/script.py
@@ -1,5 +1,13 @@
+import base64
+import gc
+import io
+from pathlib import Path
+
 import gradio as gr
-import speech_recognition as sr
+import numpy as np
+import torch
+import whisper
+from pydub import AudioSegment
 
 from modules import shared
 
@@ -8,13 +16,16 @@ input_hijack = {
     'value': ["", ""]
 }
 
-# parameters which can be customized in settings.json of webui
+# parameters which can be customized in settings.yaml of webui
 params = {
     'whipser_language': 'english',
     'whipser_model': 'small.en',
     'auto_submit': True
 }
 
+startup_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+WHISPERMODEL = whisper.load_model(params['whipser_model'], device=startup_device)
+
 
 def chat_input_modifier(text, visible_text, state):
     global input_hijack
@@ -25,47 +36,84 @@ def chat_input_modifier(text, visible_text, state):
         return text, visible_text
 
 
-def do_stt(audio, whipser_model, whipser_language):
-    transcription = ""
-    r = sr.Recognizer()
+def do_stt(audio, whipser_language):
+    # use pydub to convert sample_rate and sample_width for whisper input
+    dubaudio = AudioSegment.from_file(io.BytesIO(audio))
+    dubaudio = dubaudio.set_channels(1)
+    dubaudio = dubaudio.set_frame_rate(16000)
+    dubaudio = dubaudio.set_sample_width(2)
 
-    # Convert to AudioData
-    audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)
+    # same method to get the array as openai whisper repo used from wav file
+    audio_np = np.frombuffer(dubaudio.raw_data, np.int16).flatten().astype(np.float32) / 32768.0
 
-    try:
-        transcription = r.recognize_whisper(audio_data, language=whipser_language, model=whipser_model)
-    except sr.UnknownValueError:
-        print("Whisper could not understand audio")
-    except sr.RequestError as e:
-        print("Could not request results from Whisper", e)
+    if len(whipser_language) == 0:
+        result = WHISPERMODEL.transcribe(audio=audio_np)
+    else:
+        result = WHISPERMODEL.transcribe(audio=audio_np, language=whipser_language)
+    return result["text"]
 
+
+def auto_transcribe(audio, auto_submit, whipser_language):
+    if audio is None or audio == "":
+        print("Whisper received no audio data")
+        return "", ""
+    audio_bytes = base64.b64decode(audio.split(',')[1])
+
+    transcription = do_stt(audio_bytes, whipser_language)
+    if auto_submit:
+        input_hijack.update({"state": True, "value": [transcription, transcription]})
     return transcription
 
 
-def auto_transcribe(audio, auto_submit, whipser_model, whipser_language):
-    if audio is None:
-        return "", ""
-    transcription = do_stt(audio, whipser_model, whipser_language)
-    if auto_submit:
-        input_hijack.update({"state": True, "value": [transcription, transcription]})
+def reload_whispermodel(whisper_model_name: str, whisper_language: str, device: str):
+    if len(whisper_model_name) > 0:
+        global WHISPERMODEL
+        WHISPERMODEL = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
 
-    return transcription, None
+        if device != "none":
+            if device == "cuda":
+                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+            WHISPERMODEL = whisper.load_model(whisper_model_name, device=device)
+            params.update({"whipser_model": whisper_model_name})
+            if ".en" in whisper_model_name:
+                whisper_language = "english"
+            audio_update = gr.Audio.update(interactive=True)
+        else:
+            audio_update = gr.Audio.update(interactive=False)
+        return [whisper_model_name, whisper_language, str(device), audio_update]
 
 
 def ui():
     with gr.Accordion("Whisper STT", open=True):
         with gr.Row():
-            audio = gr.Audio(source="microphone")
+            audio = gr.Textbox(elem_id="audio-base64", visible=False)
+            record_button = gr.Button("Rec.", elem_id="record-button", elem_classes="custom-button")
         with gr.Row():
             with gr.Accordion("Settings", open=False):
                 auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
-                whipser_model = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
-                whipser_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
+                device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"])
+                whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
+                whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
 
-    audio.stop_recording(
-        auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then(
-        None, auto_submit, None, js="(check) => {if (check) { document.getElementById('Generate').click() }}")
+    audio.change(
+        auto_transcribe, [audio, auto_submit, whisper_language], [shared.gradio['textbox']]).then(
+        None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}")
 
-    whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None)
-    whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None)
+    device_dropd.input(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
+    whisper_model_dropd.change(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
+    whisper_language.change(lambda x: params.update({"whipser_language": x}), whisper_language, None)
     auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)
+
+
+def custom_js():
+    """
+    Returns custom javascript as a string. It is applied whenever the web UI is
+    loaded.
+    :return:
+    """
+    with open(Path(__file__).parent.resolve() / "script.js", "r") as f:
+        return f.read()
diff --git a/js/main.js b/js/main.js
index 72568e1b..6b456517 100644
--- a/js/main.js
+++ b/js/main.js
@@ -7,30 +7,30 @@ main_parent.parentNode.style = "gap: 0";
 main_parent.parentNode.parentNode.style = "padding: 0";
 
 document.querySelector(".header_bar").addEventListener("click", function(event) {
-  if (event.target.tagName === "BUTTON") {
-    const buttonText = event.target.textContent.trim();
+  if (event.target.tagName !== "BUTTON") return;
 
-    let chat_visible = (buttonText == "Chat");
-    let default_visible = (buttonText == "Default");
-    let notebook_visible = (buttonText == "Notebook");
+  const buttonText = event.target.textContent.trim();
+  const extensionsVisible = ["Chat", "Default", "Notebook"].includes(buttonText);
+  const chatVisible = buttonText === "Chat";
+  const showControlsChecked = document.querySelector("#show-controls input").checked;
+  const extensions = document.querySelector("#extensions");
 
-    // Check if one of the generation tabs is visible
-    if (chat_visible || notebook_visible || default_visible) {
-      extensions && (extensions.style.display = "flex");
-
-      if (chat_visible) {
-        this.style.marginBottom = "0px";
-        extensions && (extensions.style.maxWidth = "880px");
-        extensions && (extensions.style.padding = "0px");
-      } else {
-        this.style.marginBottom = "19px";
-        extensions && (extensions.style.maxWidth = "none");
-        extensions && (extensions.style.padding = "15px");
-      }
-    } else {
-      this.style.marginBottom = "19px";
-      extensions && (extensions.style.display = "none");
+  if (extensionsVisible) {
+    if (extensions) {
+      extensions.style.display = "flex";
+      extensions.style.maxWidth = chatVisible ? "880px" : "none";
+      extensions.style.padding = chatVisible ? "0px" : "15px";
     }
+    this.style.marginBottom = chatVisible ? "0px" : "19px";
+
+    if (chatVisible && !showControlsChecked) {
+      document.querySelectorAll("#chat-tab > div > :nth-child(n+2), #extensions").forEach(element => {
+        element.style.display = "none";
+      });
+    }
+  } else {
+    this.style.marginBottom = "19px";
+    if (extensions) extensions.style.display = "none";
   }
 });
 
@@ -98,20 +98,6 @@ document.addEventListener("keydown", function(event) {
     document.getElementById("Impersonate").click();
   }
 
-  // Switch between tabs on Tab
-  else if (!event.ctrlKey && !event.shiftKey && !event.altKey && !event.metaKey && event.key === "Tab") {
-    event.preventDefault();
-    var parametersButton = document.getElementById("parameters-button");
-    var parentContainer = parametersButton.parentNode;
-    var selectedChild = parentContainer.querySelector(".selected");
-
-    if (selectedChild.id == "parameters-button") {
-      document.getElementById(previousTabId).click();
-    } else {
-      previousTabId = selectedChild.id;
-      parametersButton.click();
-    }
-  }
 });
 
 //------------------------------------------------
@@ -548,3 +534,69 @@ document.querySelectorAll(".focus-on-chat-input").forEach(element => {
     document.querySelector("#chat-input textarea").focus();
   });
 });
+
+//------------------------------------------------
+// Fix a border around the "past chats" menu
+//------------------------------------------------
+document.getElementById("past-chats").parentNode.style.borderRadius = "0px";
+
+//------------------------------------------------
+// Allow the character dropdown to coexist at the
+// Chat tab and the Parameters > Character tab
+//------------------------------------------------
+
+const headerBar = document.querySelector(".header_bar");
+let originalParent;
+let originalIndex; // To keep track of the original position
+let movedElement;
+
+function moveToChatTab() {
+  const characterMenu = document.getElementById("character-menu");
+  const grandParent = characterMenu.parentElement.parentElement;
+
+  // Save the initial location for the character dropdown
+  if (!originalParent) {
+    originalParent = grandParent.parentElement;
+    originalIndex = Array.from(originalParent.children).indexOf(grandParent);
+    movedElement = grandParent;
+  }
+
+  // Do not show the Character dropdown in the Chat tab when "instruct" mode is selected
+  const instructRadio = document.querySelector("#chat-mode input[value=\"instruct\"]");
+  if (instructRadio && instructRadio.checked) {
+    grandParent.style.display = "none";
+  }
+
+  const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild;
+  const newParent = chatControlsFirstChild;
+  let newPosition = newParent.children.length - 2;
+
+  newParent.insertBefore(grandParent, newParent.children[newPosition]);
+  document.getElementById("save-character").style.display = "none";
+}
+
+function restoreOriginalPosition() {
+  if (originalParent && movedElement) {
+    if (originalIndex >= originalParent.children.length) {
+      originalParent.appendChild(movedElement);
+    } else {
+      originalParent.insertBefore(movedElement, originalParent.children[originalIndex]);
+    }
+
+    document.getElementById("save-character").style.display = "";
+    movedElement.style.display = "";
+  }
+}
+
+headerBar.addEventListener("click", (e) => {
+  if (e.target.tagName === "BUTTON") {
+    const tabName = e.target.textContent.trim();
+    if (tabName === "Chat") {
+      moveToChatTab();
+    } else {
+      restoreOriginalPosition();
+    }
+  }
+});
+
+moveToChatTab();
diff --git a/modules/LoRA.py b/modules/LoRA.py
index 15132f4e..eda5e406 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -73,7 +73,7 @@ def add_lora_autogptq(lora_names):
         if len(lora_names) > 1:
             logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
         if not shared.args.no_inject_fused_attention:
-            logger.warning('Fused Atttention + AutoGPTQ may break Lora loading. Disable it.')
+            logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')
 
         peft_config = GPTQLoraConfig(
             inference_mode=True,
diff --git a/modules/RoPE.py b/modules/RoPE.py
deleted file mode 100644
index 31163a33..00000000
--- a/modules/RoPE.py
+++ /dev/null
@@ -1,18 +0,0 @@
-def get_alpha_value(alpha, base):
-    '''
-    Gets alpha_value from alpha_value and rope_freq_base
-    '''
-    if base > 0:
-        return (base / 10000.) ** (63 / 64.)
-    else:
-        return alpha
-
-
-def get_rope_freq_base(alpha, base):
-    '''
-    Gets rope_freq_base from alpha_value and rope_freq_base
-    '''
-    if base > 0:
-        return base
-    else:
-        return 10000 * alpha ** (64 / 63.)
diff --git a/modules/block_requests.py b/modules/block_requests.py
index 0eb10fa2..778b9f5a 100644
--- a/modules/block_requests.py
+++ b/modules/block_requests.py
@@ -43,19 +43,27 @@ def my_open(*args, **kwargs):
         with original_open(*args, **kwargs) as f:
             file_contents = f.read()
 
-        file_contents = file_contents.replace(b'\t\t<script\n\t\t\tsrc="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.9/iframeResizer.contentWindow.min.js"\n\t\t\tasync\n\t\t></script>', b'')
-        file_contents = file_contents.replace(b'cdnjs.cloudflare.com', b'127.0.0.1')
+        if len(args) > 1 and args[1] == 'rb':
+            file_contents = file_contents.decode('utf-8')
+
+        file_contents = file_contents.replace('\t\t<script\n\t\t\tsrc="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.9/iframeResizer.contentWindow.min.js"\n\t\t\tasync\n\t\t></script>', '')
+        file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1')
         file_contents = file_contents.replace(
-            b'</head>',
-            b'\n    <script src="file/js/katex/katex.min.js"></script>'
-            b'\n    <script src="file/js/katex/auto-render.min.js"></script>'
-            b'\n    <script src="file/js/highlightjs/highlight.min.js"></script>'
-            b'\n    <script src="file/js/highlightjs/highlightjs-copy.min.js"></script>'
-            b'\n    <script>hljs.addPlugin(new CopyButtonPlugin());</script>'
-            b'\n  </head>'
+            '</head>',
+            '\n    <script src="file/js/katex/katex.min.js"></script>'
+            '\n    <script src="file/js/katex/auto-render.min.js"></script>'
+            '\n    <script src="file/js/highlightjs/highlight.min.js"></script>'
+            '\n    <script src="file/js/highlightjs/highlightjs-copy.min.js"></script>'
+            '\n    <script>hljs.addPlugin(new CopyButtonPlugin());</script>'
+            '\n  </head>'
         )
 
-        return io.BytesIO(file_contents)
+        if len(args) > 1 and args[1] == 'rb':
+            file_contents = file_contents.encode('utf-8')
+            return io.BytesIO(file_contents)
+        else:
+            return io.StringIO(file_contents)
+
     else:
         return original_open(*args, **kwargs)
 
diff --git a/modules/chat.py b/modules/chat.py
index 43f5466b..6640776f 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -3,6 +3,7 @@ import copy
 import functools
 import html
 import json
+import pprint
 import re
 from datetime import datetime
 from functools import partial
@@ -259,10 +260,27 @@ def get_stopping_strings(state):
             suffix_bot + prefix_user,
         ]
 
+    # Try to find the EOT token
+    for item in stopping_strings.copy():
+        item = item.strip()
+        if item.startswith("<") and ">" in item:
+            stopping_strings.append(item.split(">")[0] + ">")
+        elif item.startswith("[") and "]" in item:
+            stopping_strings.append(item.split("]")[0] + "]")
+
     if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
         stopping_strings += state.pop('stopping_strings')
 
-    return list(set(stopping_strings))
+    # Remove redundant items that start with another item
+    result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)]
+    result = list(set(result))
+
+    if shared.args.verbose:
+        logger.info("STOPPING_STRINGS=")
+        pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result)
+        print()
+
+    return result
 
 
 def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
@@ -492,7 +510,7 @@ def save_history(history, unique_id, character, mode):
         p.parent.mkdir(parents=True)
 
     with open(p, 'w', encoding='utf-8') as f:
-        f.write(json.dumps(history, indent=4))
+        f.write(json.dumps(history, indent=4, ensure_ascii=False))
 
 
 def rename_history(old_id, new_id, character, mode):
@@ -505,17 +523,16 @@ def rename_history(old_id, new_id, character, mode):
         logger.error(f"The following path is not allowed: \"{new_p}\".")
     elif new_p == old_p:
         logger.info("The provided path is identical to the old one.")
+    elif new_p.exists():
+        logger.error(f"The new path already exists and will not be overwritten: \"{new_p}\".")
     else:
         logger.info(f"Renaming \"{old_p}\" to \"{new_p}\"")
         old_p.rename(new_p)
 
 
-def find_all_histories(state):
-    if shared.args.multi_user:
-        return ['']
-
+def get_paths(state):
     if state['mode'] == 'instruct':
-        paths = Path('logs/instruct').glob('*.json')
+        return Path('logs/instruct').glob('*.json')
     else:
         character = state['character_menu']
 
@@ -533,12 +550,55 @@ def find_all_histories(state):
             p.parent.mkdir(exist_ok=True)
             new_p.rename(p)
 
-        paths = Path(f'logs/chat/{character}').glob('*.json')
+        return Path(f'logs/chat/{character}').glob('*.json')
 
+
+def find_all_histories(state):
+    if shared.args.multi_user:
+        return ['']
+
+    paths = get_paths(state)
     histories = sorted(paths, key=lambda x: x.stat().st_mtime, reverse=True)
-    histories = [path.stem for path in histories]
+    return [path.stem for path in histories]
 
-    return histories
+
+def find_all_histories_with_first_prompts(state):
+    if shared.args.multi_user:
+        return []
+
+    paths = get_paths(state)
+    histories = sorted(paths, key=lambda x: x.stat().st_mtime, reverse=True)
+
+    result = []
+    for i, path in enumerate(histories):
+        filename = path.stem
+        if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename):
+            with open(path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+                first_prompt = ""
+                if data and 'visible' in data and len(data['visible']) > 0:
+                    if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
+                        if len(data['visible']) > 1:
+                            first_prompt = html.unescape(data['visible'][1][0])
+                        elif i == 0:
+                            first_prompt = "New chat"
+                    else:
+                        first_prompt = html.unescape(data['visible'][0][0])
+                elif i == 0:
+                    first_prompt = "New chat"
+        else:
+            first_prompt = filename
+
+        first_prompt = first_prompt.strip()
+
+        # Truncate the first prompt if it's longer than 32 characters
+        if len(first_prompt) > 32:
+            first_prompt = first_prompt[:29] + '...'
+
+        result.append((first_prompt, filename))
+
+    return result
 
 
 def load_latest_history(state):
@@ -569,17 +629,17 @@ def load_history_after_deletion(state, idx):
     if shared.args.multi_user:
         return start_new_chat(state)
 
-    histories = find_all_histories(state)
+    histories = find_all_histories_with_first_prompts(state)
     idx = min(int(idx), len(histories) - 1)
     idx = max(0, idx)
 
     if len(histories) > 0:
-        history = load_history(histories[idx], state['character_menu'], state['mode'])
+        history = load_history(histories[idx][1], state['character_menu'], state['mode'])
     else:
         history = start_new_chat(state)
-        histories = find_all_histories(state)
+        histories = find_all_histories_with_first_prompts(state)
 
-    return history, gr.update(choices=histories, value=histories[idx])
+    return history, gr.update(choices=histories, value=histories[idx][1])
 
 
 def update_character_menu_after_deletion(idx):
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index d39bc315..a770e342 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -48,6 +48,8 @@ class Exllamav2Model:
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
         config.no_flash_attn = shared.args.no_flash_attn
+        config.no_xformers = shared.args.no_xformers
+        config.no_sdpa = shared.args.no_sdpa
         config.num_experts_per_token = int(shared.args.num_experts_per_token)
 
         model = ExLlamaV2(config)
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index 9ab9cdc7..53143d9a 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -176,6 +176,8 @@ class Exllamav2HF(PreTrainedModel):
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
         config.no_flash_attn = shared.args.no_flash_attn
+        config.no_xformers = shared.args.no_xformers
+        config.no_sdpa = shared.args.no_sdpa
         config.num_experts_per_token = int(shared.args.num_experts_per_token)
 
         return Exllamav2HF(config)
diff --git a/modules/github.py b/modules/github.py
index 282267b6..f3dc26e1 100644
--- a/modules/github.py
+++ b/modules/github.py
@@ -32,7 +32,7 @@ def clone_or_pull_repository(github_url):
         yield f"Cloning {github_url}..."
         clone_output = subprocess.check_output(["git", "clone", github_url, repo_path], stderr=subprocess.STDOUT)
         new_extensions.add(repo_name)
-        yield f"The extension `{repo_name}` has been downloaded.\n\nPlease close the the web UI completely and launch it again to be able to load it."
+        yield f"The extension `{repo_name}` has been downloaded.\n\nPlease close the web UI completely and launch it again to be able to load it."
         return clone_output.decode()
     except subprocess.CalledProcessError as e:
         return str(e)
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 7e50f561..657133bd 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -85,15 +85,20 @@ def convert_to_markdown(string):
 
     # Unfinished list, like "\n1.". A |delete| string is added and then
     # removed to force a <ol> or <ul> to be generated instead of a <p>.
-    if re.search(r'(\n\d+\.?|\n\*\s*)$', result):
+    list_item_pattern = r'(\n\d+\.?|\n\s*[-*+]\s*([*_~]{1,3})?)$'
+    if re.search(list_item_pattern, result):
         delete_str = '|delete|'
 
         if re.search(r'(\d+\.?)$', result) and not result.endswith('.'):
             result += '.'
 
-        result = re.sub(r'(\n\d+\.?|\n\*\s*)$', r'\g<1> ' + delete_str, result)
+        # Add the delete string after the list item
+        result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result)
 
+        # Convert to HTML using markdown
         html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
+
+        # Remove the delete string from the HTML output
         pos = html_output.rfind(delete_str)
         if pos > -1:
             html_output = html_output[:pos] + html_output[pos + len(delete_str):]
diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py
index eb23177f..64280dc9 100644
--- a/modules/llama_cpp_python_hijack.py
+++ b/modules/llama_cpp_python_hijack.py
@@ -1,3 +1,5 @@
+import importlib
+import platform
 from typing import Sequence
 
 from tqdm import tqdm
@@ -5,20 +7,46 @@ from tqdm import tqdm
 from modules import shared
 from modules.cache_utils import process_llamacpp_cache
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
 
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
+imported_module = None
 
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
+
+def llama_cpp_lib():
+    global imported_module
+
+    # Determine the platform
+    is_macos = platform.system() == 'Darwin'
+
+    # Define the library names based on the platform
+    if is_macos:
+        lib_names = [
+            (None, 'llama_cpp')
+        ]
+    else:
+        lib_names = [
+            ('cpu', 'llama_cpp'),
+            ('tensorcores', 'llama_cpp_cuda_tensorcores'),
+            (None, 'llama_cpp_cuda'),
+            (None, 'llama_cpp')
+        ]
+
+    for arg, lib_name in lib_names:
+        should_import = (arg is None or getattr(shared.args, arg))
+
+        if should_import:
+            if imported_module and imported_module != lib_name:
+                # Conflict detected, raise an exception
+                raise Exception(f"Cannot import `{lib_name}` because `{imported_module}` is already imported. Switching to a different version of llama-cpp-python currently requires a server restart.")
+
+            try:
+                return_lib = importlib.import_module(lib_name)
+                imported_module = lib_name
+                monkey_patch_llama_cpp_python(return_lib)
+                return return_lib
+            except ImportError:
+                continue
+
+    return None
 
 
 def eval_with_progress(self, tokens: Sequence[int]):
@@ -63,10 +91,12 @@ def eval_with_progress(self, tokens: Sequence[int]):
         self.n_tokens += n_tokens
 
 
-def monkey_patch_generate(lib):
+def monkey_patch_llama_cpp_python(lib):
+    if getattr(lib.Llama, '_is_patched', False):
+        # If the patch is already applied, do nothing
+        return
 
     def my_generate(self, *args, **kwargs):
-
         if shared.args.streaming_llm:
             new_sequence = args[0]
             past_sequence = self._input_ids
@@ -77,11 +107,9 @@ def monkey_patch_generate(lib):
         for output in self.original_generate(*args, **kwargs):
             yield output
 
+    lib.Llama.eval = eval_with_progress
     lib.Llama.original_generate = lib.Llama.generate
     lib.Llama.generate = my_generate
 
-
-for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
-    if lib is not None:
-        lib.Llama.eval = eval_with_progress
-        monkey_patch_generate(lib)
+    # Set the flag to indicate that the patch has been applied
+    lib.Llama._is_patched = True
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index f30be66a..327e3a7b 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -7,35 +7,10 @@ from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from modules import RoPE, llama_cpp_python_hijack, shared
+from modules import shared
+from modules.llama_cpp_python_hijack import llama_cpp_lib
 from modules.logging_colors import logger
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 class LlamacppHF(PreTrainedModel):
     def __init__(self, model, path):
@@ -212,7 +187,7 @@ class LlamacppHF(PreTrainedModel):
             'mul_mat_q': not shared.args.no_mul_mat_q,
             'numa': shared.args.numa,
             'n_gpu_layers': shared.args.n_gpu_layers,
-            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
+            'rope_freq_base': shared.args.rope_freq_base,
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'logits_all': shared.args.logits_all,
@@ -221,6 +196,13 @@ class LlamacppHF(PreTrainedModel):
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)
 
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index b2a25d36..a16230ca 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -4,37 +4,12 @@ from functools import partial
 import numpy as np
 import torch
 
-from modules import RoPE, llama_cpp_python_hijack, shared
+from modules import shared
 from modules.callbacks import Iteratorize
+from modules.llama_cpp_python_hijack import llama_cpp_lib
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
     logits[eos_token] = -float('inf')
@@ -92,7 +67,7 @@ class LlamaCppModel:
             'mul_mat_q': not shared.args.no_mul_mat_q,
             'numa': shared.args.numa,
             'n_gpu_layers': shared.args.n_gpu_layers,
-            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
+            'rope_freq_base': shared.args.rope_freq_base,
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'offload_kqv': not shared.args.no_offload_kqv,
@@ -100,6 +75,13 @@ class LlamaCppModel:
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
diff --git a/modules/loaders.py b/modules/loaders.py
index cd9d0f88..75ed897b 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -21,8 +21,8 @@ loaders_and_params = OrderedDict({
         'trust_remote_code',
         'no_use_fast',
         'use_flash_attention_2',
+        'use_eager_attention',
         'alpha_value',
-        'rope_freq_base',
         'compress_pos_emb',
         'disable_exllama',
         'disable_exllamav2',
@@ -31,6 +31,8 @@ loaders_and_params = OrderedDict({
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
@@ -38,7 +40,6 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'no_mul_mat_q',
-        'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
         'cpu',
@@ -46,13 +47,15 @@ loaders_and_params = OrderedDict({
         'no_offload_kqv',
         'row_split',
         'tensorcores',
-        'flash-attn',
+        'flash_attn',
         'streaming_llm',
         'attention_sink_size',
     ],
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
@@ -60,7 +63,6 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'no_mul_mat_q',
-        'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
         'cpu',
@@ -72,7 +74,7 @@ loaders_and_params = OrderedDict({
         'no_offload_kqv',
         'row_split',
         'tensorcores',
-        'flash-attn',
+        'flash_attn',
         'streaming_llm',
         'attention_sink_size',
         'llamacpp_HF_info',
@@ -82,6 +84,8 @@ loaders_and_params = OrderedDict({
         'max_seq_len',
         'cfg_cache',
         'no_flash_attn',
+        'no_xformers',
+        'no_sdpa',
         'num_experts_per_token',
         'cache_8bit',
         'cache_4bit',
@@ -95,6 +99,8 @@ loaders_and_params = OrderedDict({
         'gpu_split',
         'max_seq_len',
         'no_flash_attn',
+        'no_xformers',
+        'no_sdpa',
         'num_experts_per_token',
         'cache_8bit',
         'cache_4bit',
@@ -134,6 +140,11 @@ loaders_and_params = OrderedDict({
         'hqq_backend',
         'trust_remote_code',
         'no_use_fast',
+    ],
+    'TensorRT-LLM': [
+        'max_seq_len',
+        'cpp_runner',
+        'tensorrt_llm_info',
     ]
 })
 
@@ -319,6 +330,16 @@ loaders_samplers = {
         'skip_special_tokens',
         'auto_max_new_tokens',
     },
+    'TensorRT-LLM': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'ban_eos_token',
+        'auto_max_new_tokens',
+    }
 }
 
 
diff --git a/modules/logits.py b/modules/logits.py
index 1fe2e73e..4233c8a5 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -16,15 +16,20 @@ def get_next_logits(*args, **kwargs):
     if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
         shared.model, shared.tokenizer = load_model(shared.previous_model_name)
 
-    shared.generation_lock.acquire()
+    needs_lock = not args[2]  # use_samplers
+    if needs_lock:
+        shared.generation_lock.acquire()
+
     try:
         result = _get_next_logits(*args, **kwargs)
     except Exception:
         traceback.print_exc()
         result = None
 
-    models.last_generation_time = time.time()
-    shared.generation_lock.release()
+    if needs_lock:
+        models.last_generation_time = time.time()
+        shared.generation_lock.release()
+
     return result
 
 
diff --git a/modules/models.py b/modules/models.py
index b1d0b1cb..07c14308 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,5 +1,4 @@
 import gc
-import logging
 import os
 import pprint
 import re
@@ -26,10 +25,9 @@ from transformers import (
 )
 
 import modules.shared as shared
-from modules import RoPE, sampler_hijack
+from modules import sampler_hijack
 from modules.logging_colors import logger
 from modules.models_settings import get_model_metadata
-from modules.relative_imports import RelativeImport
 
 transformers.logging.set_verbosity_error()
 
@@ -79,6 +77,7 @@ def load_model(model_name, loader=None):
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'AutoAWQ': AutoAWQ_loader,
         'HQQ': HQQ_loader,
+        'TensorRT-LLM': TensorRT_LLM_loader,
     }
 
     metadata = get_model_metadata(model_name)
@@ -103,7 +102,7 @@ def load_model(model_name, loader=None):
             tokenizer = load_tokenizer(model_name, model)
 
     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
-    if loader.lower().startswith('exllama'):
+    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'):
         shared.settings['truncation_length'] = shared.args.max_seq_len
     elif loader in ['llama.cpp', 'llamacpp_HF']:
         shared.settings['truncation_length'] = shared.args.n_ctx
@@ -147,6 +146,9 @@ def huggingface_loader(model_name):
     if shared.args.force_safetensors:
         params['force_safetensors'] = True
 
+    if shared.args.use_eager_attention:
+        params['attn_implementation'] = 'eager'
+
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
 
     if 'chatglm' in model_name.lower():
@@ -250,7 +252,7 @@ def huggingface_loader(model_name):
         if shared.args.compress_pos_emb > 1:
             params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
         elif shared.args.alpha_value > 1:
-            params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)}
+            params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
 
         logger.info("TRANSFORMERS_PARAMS=")
         pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
@@ -267,7 +269,7 @@ def llamacpp_loader(model_name):
     if path.is_file():
         model_file = path
     else:
-        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] 
+        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
 
     logger.info(f"llama.cpp weights detected: \"{model_file}\"")
     model, tokenizer = LlamaCppModel.from_pretrained(model_file)
@@ -339,6 +341,13 @@ def HQQ_loader(model_name):
     return model
 
 
+def TensorRT_LLM_loader(model_name):
+    from modules.tensorrt_llm import TensorRTLLMModel
+
+    model = TensorRTLLMModel.from_pretrained(model_name)
+    return model
+
+
 def get_max_memory_dict():
     max_memory = {}
     max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 2d161ad5..7ae68125 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -9,6 +9,8 @@ from modules import chat, loaders, metadata_gguf, shared, ui
 
 def get_fallback_settings():
     return {
+        'bf16': False,
+        'use_eager_attention': False,
         'wbits': 'None',
         'groupsize': 'None',
         'desc_act': False,
@@ -16,6 +18,7 @@ def get_fallback_settings():
         'n_ctx': 2048,
         'rope_freq_base': 0,
         'compress_pos_emb': 1,
+        'alpha_value': 1,
         'truncation_length': shared.settings['truncation_length'],
         'skip_special_tokens': shared.settings['skip_special_tokens'],
         'custom_stopping_strings': shared.settings['custom_stopping_strings'],
@@ -58,13 +61,19 @@ def get_model_metadata(model):
                 model_settings['rope_freq_base'] = metadata[k]
             elif k.endswith('rope.scale_linear'):
                 model_settings['compress_pos_emb'] = metadata[k]
+            elif k.endswith('rope.scaling.factor'):
+                model_settings['compress_pos_emb'] = metadata[k]
             elif k.endswith('block_count'):
                 model_settings['n_gpu_layers'] = metadata[k] + 1
 
         if 'tokenizer.chat_template' in metadata:
             template = metadata['tokenizer.chat_template']
             eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
-            bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
+            if 'tokenizer.ggml.bos_token_id' in metadata:
+                bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
+            else:
+                bos_token = ""
+
             template = template.replace('eos_token', "'{}'".format(eos_token))
             template = template.replace('bos_token', "'{}'".format(bos_token))
 
@@ -77,6 +86,9 @@ def get_model_metadata(model):
         # Transformers metadata
         if hf_metadata is not None:
             metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+            if 'pretrained_config' in metadata:
+                metadata = metadata['pretrained_config']
+
             for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
                 if k in metadata:
                     model_settings['truncation_length'] = metadata[k]
@@ -87,10 +99,18 @@ def get_model_metadata(model):
             elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']:
                 model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta']
 
-            if 'rope_scaling' in metadata and type(metadata['rope_scaling']) is dict and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
+            if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
                 if metadata['rope_scaling']['type'] == 'linear':
                     model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']
 
+            # For Gemma-2
+            if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16':
+                model_settings['bf16'] = True
+
+            # For Gemma-2
+            if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
+                model_settings['use_eager_attention'] = True
+
             # Read GPTQ metadata for old GPTQ loaders
             if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2':
                 if 'bits' in metadata['quantization_config']:
@@ -123,7 +143,7 @@ def get_model_metadata(model):
             for k in ['eos_token', 'bos_token']:
                 if k in metadata:
                     value = metadata[k]
-                    if type(value) is dict:
+                    if isinstance(value, dict):
                         value = value['content']
 
                     template = template.replace(k, "'{}'".format(value))
@@ -158,7 +178,7 @@ def infer_loader(model_name, model_settings):
     path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
     if not path_to_model.exists():
         loader = None
-    elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
+    elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
         loader = 'ExLlamav2_HF'
     elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
         loader = 'AutoAWQ'
@@ -204,14 +224,11 @@ def update_model_parameters(state, initial=False):
             value = vars(shared.args_defaults)[element]
 
         # Making some simple conversions
-        if element in ['wbits', 'groupsize', 'pre_layer']:
+        if element in ['wbits', 'groupsize']:
             value = int(value)
         elif element == 'cpu_memory' and value is not None:
             value = f"{value}MiB"
 
-        if element in ['pre_layer']:
-            value = [value] if value > 0 else None
-
         setattr(shared.args, element, value)
 
     found_positive = False
diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index 55d1997b..9fb661ae 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -204,21 +204,25 @@ class DRYLogitsProcessor(LogitsProcessor):
             input_ids = input_ids[:, -self._range:]
 
         for input_ids_row, scores_row in zip(input_ids, scores):
-            # Raw integer must be extracted here to check for set membership.
-            last_token = input_ids_row[-1].item()
+            # Use normal Python data types for improved performance
+            input_ids = input_ids_row.tolist()
 
+            last_token = input_ids[-1]
             if last_token in self.sequence_breakers:
                 continue
 
             # Exclude the last token as it always matches.
-            match_indices = (input_ids_row[:-1] == last_token).nonzero()
+            match_indices = []
+            for idx, val in enumerate(input_ids[:-1]):
+                if val == last_token:
+                    match_indices.append(idx)
 
             # Stores the maximum matching sequence length
             # for each token immediately following the sequence in the input.
             match_lengths = {}
 
             for i in match_indices:
-                next_token = input_ids_row[i+1].item()
+                next_token = input_ids[i + 1]
 
                 if next_token in self.sequence_breakers:
                     continue
@@ -227,15 +231,15 @@ class DRYLogitsProcessor(LogitsProcessor):
                 # so the match is at least of length 1.
                 match_length = 1
 
-                # Extend the match backwards as far as possible.
-                while True:
+                # Extend the match backwards (at most to 50 to prevent exponent overflow at penalty calculation) (this cap also improves performance on worst case)
+                while match_length < 50:
                     j = i - match_length
                     if j < 0:
                         # Start of input reached.
                         break
 
-                    previous_token = input_ids_row[-(match_length+1)].item()
-                    if input_ids_row[j] != previous_token:
+                    previous_token = input_ids[-(match_length + 1)]
+                    if input_ids[j] != previous_token:
                         # Start of match reached.
                         break
 
@@ -355,14 +359,14 @@ class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor):
         return scores
 
 
-def get_logits_warper_patch(self, generation_config):
+def get_logits_warper_patch(self, generation_config, **kwargs):
 
     # Parameter sanitization
     if isinstance(generation_config.temperature, int):
         generation_config.temperature = float(generation_config.temperature)  # Must be float
 
     # Get the original warpers
-    warpers = self._get_logits_warper_old(generation_config)
+    warpers = self._get_logits_warper_old(generation_config, **kwargs)
 
     # Replace temperature with our modified class.
     # Currently, it behaves identically to the original.
diff --git a/modules/shared.py b/modules/shared.py
index 373089dc..d96e3156 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -106,6 +106,7 @@ group.add_argument('--trust-remote-code', action='store_true', help='Set trust_r
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
 group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
+group.add_argument('--use_eager_attention', action='store_true', help='Set attn_implementation= eager while loading the model.')
 
 # bitsandbytes 4-bit
 group = parser.add_argument_group('bitsandbytes 4-bit')
@@ -142,6 +143,8 @@ group.add_argument('--autosplit', action='store_true', help='Autosplit the model
 group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
 group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
 group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
+group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
+group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
 group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
 group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.')
 group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
@@ -165,6 +168,10 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='Dis
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
 
+# TensorRT-LLM
+group = parser.add_argument_group('TensorRT-LLM')
+group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
+
 # DeepSpeed
 group = parser.add_argument_group('DeepSpeed')
 group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@@ -263,6 +270,8 @@ def fix_loader_name(name):
         return 'AutoAWQ'
     elif name in ['hqq']:
         return 'HQQ'
+    elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
+        return 'TensorRT-LLM'
 
 
 def add_extension(name, last=False):
diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py
new file mode 100644
index 00000000..c2685b75
--- /dev/null
+++ b/modules/tensorrt_llm.py
@@ -0,0 +1,131 @@
+from pathlib import Path
+
+import tensorrt_llm
+import torch
+from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
+
+from modules import shared
+from modules.logging_colors import logger
+from modules.text_generation import (
+    get_max_prompt_length,
+    get_reply_from_output_ids
+)
+
+
+class TensorRTLLMModel:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(self, path_to_model):
+
+        path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
+        runtime_rank = tensorrt_llm.mpi_rank()
+
+        # Define model settings
+        runner_kwargs = dict(
+            engine_dir=str(path_to_model),
+            lora_dir=None,
+            rank=runtime_rank,
+            debug_mode=False,
+            lora_ckpt_source="hf",
+        )
+
+        if shared.args.cpp_runner:
+            logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
+            runner_kwargs.update(
+                max_batch_size=1,
+                max_input_len=shared.args.max_seq_len - 512,
+                max_output_len=512,
+                max_beam_width=1,
+                max_attention_window_size=None,
+                sink_token_length=None,
+            )
+        else:
+            logger.info("TensorRT-LLM: Using \"ModelRunner\"")
+
+        # Load the model
+        runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner
+        runner = runner_cls.from_dir(**runner_kwargs)
+
+        result = self()
+        result.model = runner
+        result.runtime_rank = runtime_rank
+
+        return result
+
+    def generate_with_streaming(self, prompt, state):
+        batch_input_ids = []
+        input_ids = shared.tokenizer.encode(
+            prompt,
+            add_special_tokens=True,
+            truncation=False,
+        )
+        input_ids = torch.tensor(input_ids, dtype=torch.int32)
+        input_ids = input_ids[-get_max_prompt_length(state):]  # Apply truncation_length
+        batch_input_ids.append(input_ids)
+
+        if shared.args.cpp_runner:
+            max_new_tokens = min(512, state['max_new_tokens'])
+        elif state['auto_max_new_tokens']:
+            max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
+        else:
+            max_new_tokens = state['max_new_tokens']
+
+        with torch.no_grad():
+            generator = self.model.generate(
+                batch_input_ids,
+                max_new_tokens=max_new_tokens,
+                max_attention_window_size=None,
+                sink_token_length=None,
+                end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1,
+                pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id,
+                temperature=state['temperature'],
+                top_k=state['top_k'],
+                top_p=state['top_p'],
+                num_beams=1,
+                length_penalty=1.0,
+                repetition_penalty=state['repetition_penalty'],
+                presence_penalty=state['presence_penalty'],
+                frequency_penalty=state['frequency_penalty'],
+                stop_words_list=None,
+                bad_words_list=None,
+                lora_uids=None,
+                prompt_table_path=None,
+                prompt_tasks=None,
+                streaming=not shared.args.cpp_runner,
+                output_sequence_lengths=True,
+                return_dict=True,
+                medusa_choices=None
+            )
+
+        torch.cuda.synchronize()
+
+        cumulative_reply = ''
+        starting_from = batch_input_ids[0].shape[-1]
+
+        if shared.args.cpp_runner:
+            sequence_length = generator['sequence_lengths'][0].item()
+            output_ids = generator['output_ids'][0][0][:sequence_length].tolist()
+
+            cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
+            starting_from = sequence_length
+            yield cumulative_reply
+        else:
+            for curr_outputs in generator:
+                if shared.stop_everything:
+                    break
+
+                sequence_length = curr_outputs['sequence_lengths'][0].item()
+                output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
+
+                cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
+                starting_from = sequence_length
+                yield cumulative_reply
+
+    def generate(self, prompt, state):
+        output = ''
+        for output in self.generate_with_streaming(prompt, state):
+            pass
+
+        return output
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 4f324239..d971a30e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -54,7 +54,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
             yield ''
             return
 
-        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
+        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
             generate_func = generate_reply_custom
         else:
             generate_func = generate_reply_HF
@@ -132,14 +132,14 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if shared.tokenizer is None:
         raise ValueError('No tokenizer is loaded')
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
         input_ids = shared.tokenizer.encode(str(prompt))
         if shared.model.__class__.__name__ not in ['Exllamav2Model']:
             input_ids = np.array(input_ids).reshape(1, len(input_ids))
     else:
         input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
 
-        if hasattr(shared.tokenizer, 'bos_token_id'):
+        if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
             if add_bos_token:
                 if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
                     # Add a missing bos token (it may not have been added due to faulty model metadata)
@@ -158,7 +158,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
         return input_ids
     elif shared.args.deepspeed:
         import deepspeed
diff --git a/modules/ui.py b/modules/ui.py
index 0126190a..d77266ce 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -43,6 +43,11 @@ theme = gr.themes.Default(
     body_text_color_subdued='#484848',
     background_fill_secondary='#eaeaea',
     background_fill_primary='var(--neutral-50)',
+    body_background_fill="white",
+    block_background_fill="#f4f4f4",
+    body_text_color="#333",
+    button_secondary_background_fill="#f4f4f4",
+    button_secondary_border_color="var(--border-color-primary)"
 )
 
 if Path("notification.mp3").exists():
@@ -64,13 +69,13 @@ def list_model_elements():
         'trust_remote_code',
         'no_use_fast',
         'use_flash_attention_2',
+        'use_eager_attention',
         'load_in_4bit',
         'compute_dtype',
         'quant_type',
         'use_double_quant',
         'wbits',
         'groupsize',
-        'pre_layer',
         'triton',
         'desc_act',
         'no_inject_fused_attention',
@@ -80,6 +85,8 @@ def list_model_elements():
         'disable_exllamav2',
         'cfg_cache',
         'no_flash_attn',
+        'no_xformers',
+        'no_sdpa',
         'num_experts_per_token',
         'cache_8bit',
         'cache_4bit',
@@ -103,10 +110,11 @@ def list_model_elements():
         'no_offload_kqv',
         'row_split',
         'tensorcores',
-        'flash-attn',
+        'flash_attn',
         'streaming_llm',
         'attention_sink_size',
         'hqq_backend',
+        'cpp_runner',
     ]
     if is_torch_xpu_available():
         for i in range(torch.xpu.device_count()):
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 3193bd67..c6f6ddb0 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -19,7 +19,7 @@ def create_ui():
     mu = shared.args.multi_user
 
     shared.gradio['Chat input'] = gr.State()
-    shared.gradio['history'] = gr.State({'internal': [], 'visible': []})
+    shared.gradio['history'] = gr.JSON({'internal': [], 'visible': []}, visible=False)
 
     with gr.Tab('Chat', elem_id='chat-tab', elem_classes=("old-ui" if shared.args.chat_buttons else None)):
         with gr.Row():
@@ -62,9 +62,6 @@ def create_ui():
 
         with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
             with gr.Column():
-                with gr.Row():
-                    shared.gradio['unique_id'] = gr.Dropdown(label='Past chats', elem_classes=['slim-dropdown'], interactive=not mu)
-
                 with gr.Row():
                     shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
                     shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
@@ -74,8 +71,13 @@ def create_ui():
 
                 with gr.Row(elem_id='rename-row'):
                     shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background'])
-                    shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
-                    shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+                    with gr.Row():
+                        shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+                        shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input'])
+
+                gr.Markdown("Past chats")
+                with gr.Row():
+                    shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats')
 
         with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']):
             with gr.Column():
@@ -83,13 +85,13 @@ def create_ui():
                     shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
 
                 with gr.Row():
-                    shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value='chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode')
+                    shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
 
                 with gr.Row():
                     shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
 
                 with gr.Row():
-                    shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=16, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
+                    shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
 
 
 def create_chat_settings_ui():
@@ -101,7 +103,7 @@ def create_chat_settings_ui():
                     with gr.Row():
                         shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown')
                         ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
-                        shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
+                        shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', elem_id="save-character", interactive=not mu)
                         shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
 
                     shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name')
@@ -181,7 +183,7 @@ def create_event_handlers():
         chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['textbox'].submit(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
@@ -189,28 +191,28 @@ def create_event_handlers():
         chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Regenerate'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Continue'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Impersonate'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then(
         chat.impersonate_wrapper, gradio(inputs), gradio('textbox', 'display'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Replace last reply'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
@@ -252,7 +254,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id'))
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False)
 
     shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr))
     shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
@@ -260,12 +262,12 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x, y: str(chat.find_all_histories(x).index(y)), gradio('interface_state', 'unique_id'), gradio('temporary_text')).then(
         chat.delete_history, gradio('unique_id', 'character_menu', 'mode'), None).then(
-        chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id')).then(
+        chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id'), show_progress=False).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
         lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
 
     shared.gradio['rename_chat'].click(
-        lambda x: x, gradio('unique_id'), gradio('rename_to')).then(
+        lambda: "My New Chat", None, gradio('rename_to')).then(
         lambda: [gr.update(visible=True)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False)
 
     shared.gradio['rename_to-cancel'].click(
@@ -274,36 +276,38 @@ def create_event_handlers():
     shared.gradio['rename_to-confirm'].click(
         chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
         lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
-        lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
+        lambda x, y: gr.update(choices=chat.find_all_histories_with_first_prompts(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
 
     shared.gradio['rename_to'].submit(
         chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
         lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
-        lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
+        lambda x, y: gr.update(choices=chat.find_all_histories_with_first_prompts(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
 
     shared.gradio['load_chat_history'].upload(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
         chat.load_history_json, gradio('load_chat_history', 'history'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False).then(
         chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}')
 
     shared.gradio['character_menu'].change(
         chat.load_character, gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context')).success(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
-        lambda: None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False).then(
+        None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
+
+    shared.gradio['mode'].change(None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}")
 
     shared.gradio['mode'].change(
         lambda x: [gr.update(visible=x != 'instruct'), gr.update(visible=x == 'chat-instruct')], gradio('mode'), gradio('chat_style', 'chat-instruct_command'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
-        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id'))
+        lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False)
 
     shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'))
     shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False)
@@ -336,11 +340,11 @@ def create_event_handlers():
 
     shared.gradio['Submit character'].click(
         chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
 
     shared.gradio['Submit tavern character'].click(
         chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
 
     shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character'))
     shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character'))
@@ -354,28 +358,28 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
         partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-default')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
 
     shared.gradio['send_instruction_to_notebook'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
         partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-notebook')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
 
     shared.gradio['send_instruction_to_negative_prompt'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
         partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('negative_prompt')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}')
 
     shared.gradio['send-chat-to-default'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-default')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
 
     shared.gradio['send-chat-to-notebook'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-notebook')).then(
-        lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
+        None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
 
-    shared.gradio['show_controls'].change(lambda x: None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
+    shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
diff --git a/modules/ui_default.py b/modules/ui_default.py
index 1f962551..e3bfe784 100644
--- a/modules/ui_default.py
+++ b/modules/ui_default.py
@@ -16,7 +16,6 @@ outputs = ('output_textbox', 'html-default')
 def create_ui():
     mu = shared.args.multi_user
     with gr.Tab('Default', elem_id='default-tab'):
-        shared.gradio['last_input-default'] = gr.State('')
         with gr.Row():
             with gr.Column():
                 with gr.Row():
@@ -63,25 +62,23 @@ def create_ui():
 
 def create_event_handlers():
     shared.gradio['Generate-default'].click(
-        lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['textbox-default'].submit(
-        lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False)
     shared.gradio['Continue-default'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False)
     shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 6cda7ecd..7a85020f 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -101,13 +101,12 @@ def create_ui():
                             shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
                             shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                             shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
-                            shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                             shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
                             with gr.Blocks():
                                 shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
-                                shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
-                                shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
+                                shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=20000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
+                                shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=0.1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
 
                             shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
 
@@ -116,9 +115,12 @@ def create_ui():
                             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
-                            shared.gradio['flash-attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
+                            shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
+                            shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
                             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
                             shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.')
+                            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
+                            shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                             shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
                             shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
@@ -135,11 +137,12 @@ def create_ui():
                             shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
                             shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                             shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
-                            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
-                            shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.')
                             shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
-                            shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
+                            shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
+                            shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
+                            shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
                             shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
+                            shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
                             shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
                             with gr.Blocks():
                                 shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
@@ -148,9 +151,9 @@ def create_ui():
 
                             shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.')
                             shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
-                            shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
                             shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
                             shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
+                            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
 
             with gr.Column():
                 with gr.Row():
diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py
index a7c62baf..307bc0f3 100644
--- a/modules/ui_notebook.py
+++ b/modules/ui_notebook.py
@@ -67,14 +67,14 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['textbox-notebook'].submit(
         lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False)
     shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False)
@@ -83,7 +83,7 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
+        None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False)
     shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False)
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index d62b74c1..68512c7e 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -40,7 +40,6 @@ def create_ui(default_preset):
                             shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
 
                             with gr.Blocks():
-                                gr.Markdown("[DRY sequence repetition penalty](https://github.com/oobabooga/text-generation-webui/pull/5677)")
                                 shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=generate_params['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to value > 0 to enable DRY. Controls the magnitude of the penalty for the shortest penalized sequences.')
                                 shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.')
                                 shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=generate_params['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.')
diff --git a/modules/ui_session.py b/modules/ui_session.py
index 08929c33..087091ce 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -32,10 +32,10 @@ def create_ui():
         # Reset interface event
         shared.gradio['reset_interface'].click(
             set_interface_arguments, gradio('extensions_menu', 'bool_menu'), None).then(
-            lambda: None, None, None, js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;padding-top:20%;margin:0;height:100vh;color:lightgray;text-align:center;background:var(--body-background-fill)">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500); return []}')
+            None, None, None, js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;padding-top:20%;margin:0;height:100vh;color:lightgray;text-align:center;background:var(--body-background-fill)">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500); return []}')
 
         shared.gradio['toggle_dark_mode'].click(
-            lambda: None, None, None, js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}').then(
+            None, None, None, js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}').then(
             lambda x: 'dark' if x == 'light' else 'light', gradio('theme_state'), gradio('theme_state'))
 
         shared.gradio['save_settings'].click(
diff --git a/one_click.py b/one_click.py
index 55a4a3db..e94b6d44 100644
--- a/one_click.py
+++ b/one_click.py
@@ -16,9 +16,9 @@ import sys
 
 
 # Define the required PyTorch version
-TORCH_VERSION = "2.2.1"
-TORCHVISION_VERSION = "0.17.1"
-TORCHAUDIO_VERSION = "2.2.1"
+TORCH_VERSION = "2.2.2"
+TORCHVISION_VERSION = "0.17.2"
+TORCHAUDIO_VERSION = "2.2.2"
 
 # Environment
 script_dir = os.getcwd()
@@ -315,7 +315,7 @@ def install_webui():
         run_cmd("conda install -y libuv")
 
     # Install the webui requirements
-    update_requirements(initial_installation=True)
+    update_requirements(initial_installation=True, pull=False)
 
 
 def get_extensions_names():
diff --git a/requirements.txt b/requirements.txt
index e0248384..14a8b8df 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,13 @@
-accelerate==0.30.*
-aqlm[gpu,cpu]==1.1.5; platform_system == "Linux"
+accelerate==0.32.*
+aqlm[gpu,cpu]==1.1.6; platform_system == "Linux"
 auto-gptq==0.7.1
 bitsandbytes==0.43.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -24,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -37,31 +37,31 @@ soundfile
 openai-whisper
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 8489a97c..43412538 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.32.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,16 +32,16 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.75+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.82+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.82+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 68a82f40..34cb4599 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.32.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,14 +32,14 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.5/autoawq-0.2.5+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index d3cecd08..1b170c27 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.32.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,10 +32,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index c940cc52..d557eae1 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.32.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,12 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.75-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.82-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index 77420e6f..5aed6167 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.32.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 56bdf547..3937b002 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.32.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index c110be62..15b45a05 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,13 +1,13 @@
-accelerate==0.30.*
-aqlm[gpu,cpu]==1.1.5; platform_system == "Linux"
+accelerate==0.32.*
+aqlm[gpu,cpu]==1.1.6; platform_system == "Linux"
 auto-gptq==0.7.1
 bitsandbytes==0.43.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -24,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -35,31 +35,31 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.75+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.75+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.0.20/exllamav2-0.0.20-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index 821049bc..14e3aa88 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.32.*
 colorama
 datasets
 einops
 gradio==4.26.*
-hqq==0.1.7.post2
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
diff --git a/server.py b/server.py
index 04a5b16d..7afa954e 100644
--- a/server.py
+++ b/server.py
@@ -146,9 +146,9 @@ def create_interface():
         ui_model_menu.create_event_handlers()
 
         # Interface launch events
-        shared.gradio['interface'].load(lambda: None, None, None, js=f"() => {{if ({str(shared.settings['dark_theme']).lower()}) {{ document.getElementsByTagName('body')[0].classList.add('dark'); }} }}")
-        shared.gradio['interface'].load(lambda: None, None, None, js=f"() => {{{js}}}")
-        shared.gradio['interface'].load(lambda x: None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
+        shared.gradio['interface'].load(None, None, None, js=f"() => {{if ({str(shared.settings['dark_theme']).lower()}) {{ document.getElementsByTagName('body')[0].classList.add('dark'); }} }}")
+        shared.gradio['interface'].load(None, None, None, js=f"() => {{{js}}}")
+        shared.gradio['interface'].load(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
         shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False)
         shared.gradio['interface'].load(chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))