diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index bdf68aad..8a31d6e2 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -49,7 +49,7 @@
 .gradio-container .chat .assistant-message {
   padding: 20px;
-  background: var(--color-grey-200);
+  background: #f4f4f4;
   margin-top: 9px !important;
   margin-bottom: 12px !important;
   border-radius: 7px;
diff --git a/css/main.css b/css/main.css
index 498b3c6c..5768348e 100644
--- a/css/main.css
+++ b/css/main.css
@@ -95,7 +95,7 @@ gradio-app > :first-child {
 }
 
 .header_bar {
-  background-color: #f7f7f7;
+  background-color: #f4f4f4;
   box-shadow: 0 0 3px rgba(22 22 22 / 35%);
   margin-bottom: 0;
   overflow-x: scroll;
@@ -336,6 +336,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
     padding-left: 0;
     padding-right: 0;
   }
+
+  .chat {
+    padding-left: 0;
+    padding-right: 0;
+  }
 }
 
 .chat {
@@ -391,7 +396,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 
 .chat .message:last-child {
   margin-bottom: 0 !important;
-  padding-bottom: 0 !important;
+  padding-bottom: 15px !important;
 }
 
 .message-body li {
@@ -510,7 +515,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 #show-controls {
   position: absolute;
   height: 100%;
-  background-color: var(--background-fill-primary);
+  background-color: transparent;
   border: 0 !important;
   border-radius: 0;
 }
diff --git a/extensions/sd_api_pictures/script.py b/extensions/sd_api_pictures/script.py
index a16575cd..3a31771a 100644
--- a/extensions/sd_api_pictures/script.py
+++ b/extensions/sd_api_pictures/script.py
@@ -33,7 +33,7 @@ params = {
     'hr_upscaler': 'ESRGAN_4x',
     'hr_scale': '1.0',
     'seed': -1,
-    'sampler_name': 'DPM++ 2M Karras',
+    'sampler_name': 'DPM++ 2M',
     'steps': 32,
     'cfg_scale': 7,
     'textgen_prefix': 'Please provide a detailed and vivid description of [subject]',
diff --git a/extensions/whisper_stt/script.js b/extensions/whisper_stt/script.js
new file mode 100644
index 00000000..c4a908b5
--- /dev/null
+++ b/extensions/whisper_stt/script.js
@@ -0,0 +1,86 @@
+console.log("Whisper STT script loaded");
+
+let mediaRecorder;
+let audioChunks = [];
+let isRecording = false;
+
+window.startStopRecording = function() {
+    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
+        console.error("getUserMedia not supported on your browser!");
+        return;
+    }
+
+    if (isRecording == false) {
+        //console.log("Start recording function called");
+        navigator.mediaDevices.getUserMedia({ audio: true })
+            .then(stream => {
+                //console.log("Got audio stream");
+                mediaRecorder = new MediaRecorder(stream);
+                audioChunks = []; // Reset audio chunks
+                mediaRecorder.start();
+                //console.log("MediaRecorder started");
+                recordButton.innerHTML = recButton.innerHTML = "Stop";
+                isRecording = true;
+
+                mediaRecorder.addEventListener("dataavailable", event => {
+                    //console.log("Data available event, data size: ", event.data.size);
+                    audioChunks.push(event.data);
+                });
+
+                mediaRecorder.addEventListener("stop", () => {
+                    //console.log("MediaRecorder stopped");
+                    if (audioChunks.length > 0) {
+                        const audioBlob = new Blob(audioChunks, { type: "audio/webm" });
+                        //console.log("Audio blob created, size: ", audioBlob.size);
+                        const reader = new FileReader();
+                        reader.readAsDataURL(audioBlob);
+                        reader.onloadend = function() {
+                            const base64data = reader.result;
+                            //console.log("Audio converted to base64, length: ", base64data.length);
+
+                            const audioBase64Input = document.querySelector("#audio-base64 textarea");
+                            if (audioBase64Input) {
+                                audioBase64Input.value = base64data;
+                                audioBase64Input.dispatchEvent(new Event("input", { bubbles: true }));
+                                audioBase64Input.dispatchEvent(new Event("change", { bubbles: true }));
+                                //console.log("Updated textarea with base64 data");
+                            } else {
+                                console.error("Could not find audio-base64 textarea");
+                            }
+                        };
+                    } else {
+                        console.error("No audio data recorded for Whisper");
+                    }
+                });
+            });
+    } else {
+        //console.log("Stopping MediaRecorder");
+        recordButton.innerHTML = recButton.innerHTML = "Rec.";
+        isRecording = false;
+        mediaRecorder.stop();
+    }
+};
+
+const recordButton = gradioApp().querySelector("#record-button");
+recordButton.addEventListener("click", window.startStopRecording);
+
+
+function gradioApp() {
+    const elems = document.getElementsByTagName("gradio-app");
+    const gradioShadowRoot = elems.length == 0 ? null : elems[0].shadowRoot;
+    return gradioShadowRoot ? gradioShadowRoot : document;
+}
+
+
+// extra rec button next to generate button
+var recButton = recordButton.cloneNode(true);
+var generate_button = document.getElementById("Generate");
+generate_button.insertAdjacentElement("afterend", recButton);
+
+recButton.style.setProperty("margin-left", "-10px");
+recButton.innerHTML = "Rec.";
+
+recButton.addEventListener("click", function() {
+    recordButton.click();
+});
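[Review note] The new script.js never moves audio through a Gradio Audio component: it base64-encodes the recorded WebM blob into a data URL, writes it into the hidden #audio-base64 textarea, and fires input/change events so Gradio forwards the string to the Python handler wired up in script.py below. A minimal sketch of unpacking that payload on the Python side (hypothetical helper, not part of the patch):

    import base64

    def data_url_to_bytes(data_url: str) -> bytes:
        # "data:audio/webm;base64,GkXf..." -> raw WebM bytes
        header, payload = data_url.split(",", 1)
        return base64.b64decode(payload)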
diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py
index efc62f41..e45c8b1e 100644
--- a/extensions/whisper_stt/script.py
+++ b/extensions/whisper_stt/script.py
@@ -1,5 +1,13 @@
+import base64
+import gc
+import io
+from pathlib import Path
+
 import gradio as gr
-import speech_recognition as sr
+import numpy as np
+import torch
+import whisper
+from pydub import AudioSegment
 
 from modules import shared
 
@@ -8,13 +16,16 @@ input_hijack = {
     'value': ["", ""]
 }
 
-# parameters which can be customized in settings.json of webui
+# parameters which can be customized in settings.yaml of webui
 params = {
     'whipser_language': 'english',
     'whipser_model': 'small.en',
     'auto_submit': True
 }
 
+startup_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+WHISPERMODEL = whisper.load_model(params['whipser_model'], device=startup_device)
+
 
 def chat_input_modifier(text, visible_text, state):
     global input_hijack
@@ -25,47 +36,84 @@ def chat_input_modifier(text, visible_text, state):
         return text, visible_text
 
 
-def do_stt(audio, whipser_model, whipser_language):
-    transcription = ""
-    r = sr.Recognizer()
+def do_stt(audio, whipser_language):
+    # use pydub to convert sample_rate and sample_width for whisper input
+    dubaudio = AudioSegment.from_file(io.BytesIO(audio))
+    dubaudio = dubaudio.set_channels(1)
+    dubaudio = dubaudio.set_frame_rate(16000)
+    dubaudio = dubaudio.set_sample_width(2)
 
-    # Convert to AudioData
-    audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)
+    # same method to get the array as openai whisper repo used from wav file
+    audio_np = np.frombuffer(dubaudio.raw_data, np.int16).flatten().astype(np.float32) / 32768.0
 
-    try:
-        transcription = r.recognize_whisper(audio_data, language=whipser_language, model=whipser_model)
-    except sr.UnknownValueError:
-        print("Whisper could not understand audio")
-    except sr.RequestError as e:
-        print("Could not request results from Whisper", e)
+    if len(whipser_language) == 0:
+        result = WHISPERMODEL.transcribe(audio=audio_np)
+    else:
+        result = WHISPERMODEL.transcribe(audio=audio_np, language=whipser_language)
+    return result["text"]
+
+
+def auto_transcribe(audio, auto_submit, whipser_language):
+    if audio is None or audio == "":
+        print("Whisper received no audio data")
+        return "", ""
+    audio_bytes = base64.b64decode(audio.split(',')[1])
+
+    transcription = do_stt(audio_bytes, whipser_language)
+    if auto_submit:
+        input_hijack.update({"state": True, "value": [transcription, transcription]})
 
     return transcription
 
-def auto_transcribe(audio, auto_submit, whipser_model, whipser_language):
-    if audio is None:
-        return "", ""
-    transcription = do_stt(audio, whipser_model, whipser_language)
-    if auto_submit:
-        input_hijack.update({"state": True, "value": [transcription, transcription]})
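[Review note] do_stt() above rebuilds Whisper's expected input by hand: pydub forces mono, 16 kHz, 16-bit samples, and the int16 PCM is then scaled into float32 [-1.0, 1.0], the same convention openai-whisper uses when it loads a wav file itself. The scaling step in isolation (self-contained sketch, values illustrative):

    import numpy as np

    pcm = np.array([0, 16384, -32768], dtype=np.int16)  # 16-bit PCM samples
    audio = pcm.astype(np.float32) / 32768.0            # -> [0.0, 0.5, -1.0]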
+def reload_whispermodel(whisper_model_name: str, whisper_language: str, device: str):
+    if len(whisper_model_name) > 0:
+        global WHISPERMODEL
+        WHISPERMODEL = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
 
-    return transcription, None
+        if device != "none":
+            if device == "cuda":
+                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+            WHISPERMODEL = whisper.load_model(whisper_model_name, device=device)
+            params.update({"whipser_model": whisper_model_name})
+            if ".en" in whisper_model_name:
+                whisper_language = "english"
+            audio_update = gr.update(interactive=True)
+        else:
+            audio_update = gr.update(interactive=False)
+        return [whisper_model_name, whisper_language, str(device), audio_update]
 
 
 def ui():
     with gr.Accordion("Whisper STT", open=True):
         with gr.Row():
-            audio = gr.Audio(source="microphone")
+            audio = gr.Textbox(elem_id="audio-base64", visible=False)
+            record_button = gr.Button("Rec.", elem_id="record-button", elem_classes="custom-button")
         with gr.Row():
             with gr.Accordion("Settings", open=False):
                 auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
-                whipser_model = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
-                whipser_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
+                device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"])
+                whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
+                whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
 
-        audio.stop_recording(
-            auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then(
-            None, auto_submit, None, js="(check) => {if (check) { document.getElementById('Generate').click() }}")
+        audio.change(
+            auto_transcribe, [audio, auto_submit, whisper_language], [shared.gradio['textbox']]).then(
+            None, auto_submit, None, js="(check) => {if (check) { document.getElementById('Generate').click() }}")
 
-        whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None)
-        whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None)
+        device_dropd.input(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
+        whisper_model_dropd.change(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
+        whisper_language.change(lambda x: params.update({"whipser_language": x}), whisper_language, None)
         auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)
+
+
+def custom_js():
+    """
+    Returns custom javascript as a string. It is applied whenever the web UI is
+    loaded.
+
+    :return:
+    """
+    with open(Path(__file__).parent.resolve() / "script.js", "r") as f:
+        return f.read()
diff --git a/js/main.js b/js/main.js
index e9a980e2..6b456517 100644
--- a/js/main.js
+++ b/js/main.js
@@ -7,30 +7,30 @@ main_parent.parentNode.style = "gap: 0";
 main_parent.parentNode.parentNode.style = "padding: 0";
 
 document.querySelector(".header_bar").addEventListener("click", function(event) {
-    if (event.target.tagName === "BUTTON") {
-        const buttonText = event.target.textContent.trim();
+    if (event.target.tagName !== "BUTTON") return;
 
-        let chat_visible = (buttonText == "Chat");
-        let default_visible = (buttonText == "Default");
-        let notebook_visible = (buttonText == "Notebook");
+    const buttonText = event.target.textContent.trim();
+    const extensionsVisible = ["Chat", "Default", "Notebook"].includes(buttonText);
+    const chatVisible = buttonText === "Chat";
+    const showControlsChecked = document.querySelector("#show-controls input").checked;
+    const extensions = document.querySelector("#extensions");
 
-        // Check if one of the generation tabs is visible
-        if (chat_visible || notebook_visible || default_visible) {
-            extensions && (extensions.style.display = "flex");
-
-            if (chat_visible) {
-                this.style.marginBottom = "0px";
-                extensions && (extensions.style.maxWidth = "880px");
-                extensions && (extensions.style.padding = "0px");
-            } else {
-                this.style.marginBottom = "19px";
-                extensions && (extensions.style.maxWidth = "none");
-                extensions && (extensions.style.padding = "15px");
-            }
-        } else {
-            this.style.marginBottom = "19px";
-            extensions && (extensions.style.display = "none");
+    if (extensionsVisible) {
+        if (extensions) {
+            extensions.style.display = "flex";
+            extensions.style.maxWidth = chatVisible ? "880px" : "none";
+            extensions.style.padding = chatVisible ? "0px" : "15px";
         }
+        this.style.marginBottom = chatVisible ? "0px" : "19px";
+
+        if (chatVisible && !showControlsChecked) {
+            document.querySelectorAll("#chat-tab > div > :nth-child(n+2), #extensions").forEach(element => {
+                element.style.display = "none";
+            });
+        }
+    } else {
+        this.style.marginBottom = "19px";
+        if (extensions) extensions.style.display = "none";
     }
 });
@@ -539,3 +539,64 @@ document.querySelectorAll(".focus-on-chat-input").forEach(element => {
 // Fix a border around the "past chats" menu
 //------------------------------------------------
 document.getElementById("past-chats").parentNode.style.borderRadius = "0px";
+
+//------------------------------------------------
+// Allow the character dropdown to coexist at the
+// Chat tab and the Parameters > Character tab
+//------------------------------------------------
+
+const headerBar = document.querySelector(".header_bar");
+let originalParent;
+let originalIndex;  // To keep track of the original position
+let movedElement;
+
+function moveToChatTab() {
+    const characterMenu = document.getElementById("character-menu");
+    const grandParent = characterMenu.parentElement.parentElement;
+
+    // Save the initial location for the character dropdown
+    if (!originalParent) {
+        originalParent = grandParent.parentElement;
+        originalIndex = Array.from(originalParent.children).indexOf(grandParent);
+        movedElement = grandParent;
+    }
+
+    // Do not show the Character dropdown in the Chat tab when "instruct" mode is selected
+    const instructRadio = document.querySelector("#chat-mode input[value=\"instruct\"]");
+    if (instructRadio && instructRadio.checked) {
+        grandParent.style.display = "none";
+    }
+
+    const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild;
+    const newParent = chatControlsFirstChild;
+    let newPosition = newParent.children.length - 2;
+
+    newParent.insertBefore(grandParent, newParent.children[newPosition]);
+    document.getElementById("save-character").style.display = "none";
+}
+
+function restoreOriginalPosition() {
+    if (originalParent && movedElement) {
+        if (originalIndex >= originalParent.children.length) {
+            originalParent.appendChild(movedElement);
+        } else {
+            originalParent.insertBefore(movedElement, originalParent.children[originalIndex]);
+        }
+
+        document.getElementById("save-character").style.display = "";
+        movedElement.style.display = "";
+    }
+}
+
+headerBar.addEventListener("click", (e) => {
+    if (e.target.tagName === "BUTTON") {
+        const tabName = e.target.textContent.trim();
+        if (tabName === "Chat") {
+            moveToChatTab();
+        } else {
+            restoreOriginalPosition();
+        }
+    }
+});
+
+moveToChatTab();
diff --git a/modules/chat.py b/modules/chat.py
index 5d2bdd63..920c0f7b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -3,6 +3,7 @@ import copy
 import functools
 import html
 import json
+import pprint
 import re
 from datetime import datetime
 from functools import partial
@@ -259,10 +260,27 @@ def get_stopping_strings(state):
             suffix_bot + prefix_user,
         ]
 
+    # Try to find the EOT token
+    for item in stopping_strings.copy():
+        item = item.strip()
+        if item.startswith("<") and ">" in item:
+            stopping_strings.append(item.split(">")[0] + ">")
+        elif item.startswith("[") and "]" in item:
+            stopping_strings.append(item.split("]")[0] + "]")
+
     if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
         stopping_strings += state.pop('stopping_strings')
 
-    return list(set(stopping_strings))
+    # Remove redundant items that start with another item
+    result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)]
+    result = list(set(result))
+
+    if shared.args.verbose:
+        logger.info("STOPPING_STRINGS=")
+        pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result)
+        print()
+
+    return result
 
 
 def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
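[Review note] Worked example of the "Try to find the EOT token" loop above, covering both branches (sketch):

    s = "<|eot_id|><|start_header_id|>user<|end_header_id|>"
    assert s.split(">")[0] + ">" == "<|eot_id|>"   # "<...>" branch
    t = "[INST] Hello [/INST]"
    assert t.split("]")[0] + "]" == "[INST]"       # "[...]" branch

The redundancy filter that follows then drops the longer templated strings that begin with these bare tokens, so only the short end-of-turn markers survive.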
diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py
index eb23177f..f3f3f560 100644
--- a/modules/llama_cpp_python_hijack.py
+++ b/modules/llama_cpp_python_hijack.py
@@ -1,3 +1,4 @@
+import importlib
 from typing import Sequence
 
 from tqdm import tqdm
@@ -5,20 +6,55 @@ from tqdm import tqdm
 from modules import shared
 from modules.cache_utils import process_llamacpp_cache
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
 
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
+imported_module = None
 
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
+
+def llama_cpp_lib():
+    global imported_module
+
+    return_lib = None
+
+    if shared.args.cpu:
+        if imported_module and imported_module != 'llama_cpp':
+            raise Exception(f"Cannot import 'llama_cpp' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+        try:
+            return_lib = importlib.import_module('llama_cpp')
+            imported_module = 'llama_cpp'
+        except:
+            pass
+
+    if shared.args.tensorcores and return_lib is None:
+        if imported_module and imported_module != 'llama_cpp_cuda_tensorcores':
+            raise Exception(f"Cannot import 'llama_cpp_cuda_tensorcores' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+        try:
+            return_lib = importlib.import_module('llama_cpp_cuda_tensorcores')
+            imported_module = 'llama_cpp_cuda_tensorcores'
+        except:
+            pass
+
+    if return_lib is None:
+        if imported_module and imported_module != 'llama_cpp_cuda':
+            raise Exception(f"Cannot import 'llama_cpp_cuda' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+        try:
+            return_lib = importlib.import_module('llama_cpp_cuda')
+            imported_module = 'llama_cpp_cuda'
+        except:
+            pass
+
+    if return_lib is None and not shared.args.cpu:
+        if imported_module and imported_module != 'llama_cpp':
+            raise Exception(f"Cannot import 'llama_cpp' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+        try:
+            return_lib = importlib.import_module('llama_cpp')
+            imported_module = 'llama_cpp'
+        except:
+            pass
+
+    if return_lib is not None:
+        monkey_patch_llama_cpp_python(return_lib)
+
+    return return_lib
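[Review note] llama_cpp_lib() now resolves the backend lazily and pins the whole process to a single llama-cpp-python flavor (mixing flavors trips llama-cpp-python issue #1575, quoted in the exceptions above). Its selection order, condensed into a sketch with the same first-import-wins semantics:

    def candidate_order(cpu: bool, tensorcores: bool) -> list:
        # Priority list of module names tried by llama_cpp_lib().
        order = ['llama_cpp'] if cpu else []
        if tensorcores:
            order.append('llama_cpp_cuda_tensorcores')
        order.append('llama_cpp_cuda')
        if not cpu:
            order.append('llama_cpp')  # last-resort CPU fallback
        return order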
 
 
 def eval_with_progress(self, tokens: Sequence[int]):
@@ -63,7 +99,7 @@
     self.n_tokens += n_tokens
 
 
-def monkey_patch_generate(lib):
+def monkey_patch_llama_cpp_python(lib):
 
     def my_generate(self, *args, **kwargs):
 
@@ -77,11 +113,6 @@ def monkey_patch_generate(lib):
         for output in self.original_generate(*args, **kwargs):
             yield output
 
+    lib.Llama.eval = eval_with_progress
     lib.Llama.original_generate = lib.Llama.generate
     lib.Llama.generate = my_generate
-
-
-for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
-    if lib is not None:
-        lib.Llama.eval = eval_with_progress
-        monkey_patch_generate(lib)
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 74af5fbf..327e3a7b 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -7,35 +7,10 @@ from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from modules import llama_cpp_python_hijack, shared
+from modules import shared
+from modules.llama_cpp_python_hijack import llama_cpp_lib
 from modules.logging_colors import logger
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 class LlamacppHF(PreTrainedModel):
     def __init__(self, model, path):
@@ -221,6 +196,13 @@ class LlamacppHF(PreTrainedModel):
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index d62fd517..a16230ca 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -4,37 +4,12 @@ from functools import partial
 import numpy as np
 import torch
 
-from modules import llama_cpp_python_hijack, shared
+from modules import shared
 from modules.callbacks import Iteratorize
+from modules.llama_cpp_python_hijack import llama_cpp_lib
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
     logits[eos_token] = -float('inf')
@@ -100,6 +75,13 @@ class LlamaCppModel:
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
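[Review note] The type_k/type_v magic numbers that --cache_4bit/--cache_8bit inject in llamacpp_hf.py and llamacpp_model.py above are raw ggml_type enum values passed straight through to llama.cpp: based on ggml.h, 2 is GGML_TYPE_Q4_0 and 8 is GGML_TYPE_Q8_0, i.e. the KV cache is stored Q4_0- or Q8_0-quantized instead of f16. Named constants for readability (sketch):

    GGML_TYPE_Q4_0 = 2  # used by --cache_4bit
    GGML_TYPE_Q8_0 = 8  # used by --cache_8bit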
diff --git a/modules/loaders.py b/modules/loaders.py
index 1da37595..78601c17 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -21,6 +21,7 @@ loaders_and_params = OrderedDict({
         'trust_remote_code',
         'no_use_fast',
         'use_flash_attention_2',
+        'use_eager_attention',
         'alpha_value',
         'compress_pos_emb',
         'disable_exllama',
@@ -30,6 +31,8 @@
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
@@ -51,6 +54,8 @@
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
diff --git a/modules/models.py b/modules/models.py
index da741cb0..07c14308 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -146,6 +146,9 @@ def huggingface_loader(model_name):
     if shared.args.force_safetensors:
         params['force_safetensors'] = True
 
+    if shared.args.use_eager_attention:
+        params['attn_implementation'] = 'eager'
+
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
 
     if 'chatglm' in model_name.lower():
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 2e3fff9c..7ae68125 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -9,6 +9,8 @@ from modules import chat, loaders, metadata_gguf, shared, ui
 
 def get_fallback_settings():
     return {
+        'bf16': False,
+        'use_eager_attention': False,
         'wbits': 'None',
         'groupsize': 'None',
         'desc_act': False,
@@ -97,10 +99,18 @@
        elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']:
            model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta']
 
-        if 'rope_scaling' in metadata and type(metadata['rope_scaling']) is dict and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
+        if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
             if metadata['rope_scaling']['type'] == 'linear':
                 model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']
 
+        # For Gemma-2
+        if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16':
+            model_settings['bf16'] = True
+
+        # For Gemma-2
+        if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
+            model_settings['use_eager_attention'] = True
+
         # Read GPTQ metadata for old GPTQ loaders
         if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2':
             if 'bits' in metadata['quantization_config']:
@@ -133,7 +143,7 @@
         for k in ['eos_token', 'bos_token']:
             if k in metadata:
                 value = metadata[k]
-                if type(value) is dict:
+                if isinstance(value, dict):
                     value = value['content']
 
                 template = template.replace(k, "'{}'".format(value))
@@ -168,7 +178,7 @@ def infer_loader(model_name, model_settings):
     path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
     if not path_to_model.exists():
         loader = None
-    elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
+    elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
         loader = 'ExLlamav2_HF'
     elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
         loader = 'AutoAWQ'
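[Review note] Context for the two "For Gemma-2" heuristics above: Gemma-2 checkpoints ship bfloat16 weights and use attention logit soft-capping, which the default SDPA attention path in transformers did not implement at the time (transformers 4.42 itself recommends eager attention for this architecture). Hence bf16 and use_eager_attention become defaults whenever 'Gemma2ForCausalLM' appears in config.json's "architectures". The rationale is inferred; the hunks themselves only set the two flags.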
diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index ad74d658..9fb661ae 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -359,14 +359,14 @@ class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor):
         return scores
 
 
-def get_logits_warper_patch(self, generation_config):
+def get_logits_warper_patch(self, generation_config, **kwargs):
     # Parameter sanitization
     if isinstance(generation_config.temperature, int):
         generation_config.temperature = float(generation_config.temperature)  # Must be float
 
     # Get the original warpers
-    warpers = self._get_logits_warper_old(generation_config)
+    warpers = self._get_logits_warper_old(generation_config, **kwargs)
 
     # Replace temperature with our modified class.
     # Currently, it behaves identically to the original.
diff --git a/modules/shared.py b/modules/shared.py
index ebbfc268..e04c549a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -106,6 +106,7 @@ group.add_argument('--trust-remote-code', action='store_true', help='Set trust_r
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
 group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
+group.add_argument('--use_eager_attention', action='store_true', help='Set attn_implementation=eager while loading the model.')
 
 # bitsandbytes 4-bit
 group = parser.add_argument_group('bitsandbytes 4-bit')
diff --git a/modules/ui.py b/modules/ui.py
index c20a7888..b1c1cf6d 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -43,6 +43,11 @@ theme = gr.themes.Default(
     body_text_color_subdued='#484848',
     background_fill_secondary='#eaeaea',
     background_fill_primary='var(--neutral-50)',
+    body_background_fill="white",
+    block_background_fill="#f4f4f4",
+    body_text_color="#333",
+    button_secondary_background_fill="#f4f4f4",
+    button_secondary_border_color="var(--border-color-primary)"
 )
 
 if Path("notification.mp3").exists():
@@ -64,6 +69,7 @@ def list_model_elements():
         'trust_remote_code',
         'no_use_fast',
         'use_flash_attention_2',
+        'use_eager_attention',
         'load_in_4bit',
         'compute_dtype',
         'quant_type',
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 91951624..6942588e 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -87,16 +87,11 @@ def create_ui():
         with gr.Row():
             shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode')
 
-        with gr.Row():
-            shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', elem_classes='slim-dropdown')
-            shared.gradio['refresh_character'] = ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
-            shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
-
         with gr.Row():
             shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
 
         with gr.Row():
-            shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=16, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
+            shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
 
 
 def create_chat_settings_ui():
@@ -105,10 +100,15 @@
     with gr.Row():
         with gr.Column(scale=8):
             with gr.Tab("Character"):
+                with gr.Row():
+                    shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown')
+                    ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
+                    shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', elem_id="save-character", interactive=not mu)
+                    shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
+
                 shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name')
                 shared.gradio['context'] = gr.Textbox(value='', lines=10, label='Context', elem_classes=['add_scrollbar'])
                 shared.gradio['greeting'] = gr.Textbox(value='', lines=5, label='Greeting', elem_classes=['add_scrollbar'])
-                shared.gradio['save_character'] = gr.Button('Save character', elem_classes=['small-button'], interactive=not mu)
 
             with gr.Tab("User"):
                 shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Name')
@@ -300,8 +300,10 @@ def create_event_handlers():
         lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False).then(
         None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
 
+    shared.gradio['mode'].change(None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}")
+
     shared.gradio['mode'].change(
-        lambda x: [gr.update(visible=(x != 'instruct'))] * 4 + [gr.update(visible=(x == 'chat-instruct'))], gradio('mode'), gradio('character_menu', 'refresh_character', 'delete_character', 'chat_style', 'chat-instruct_command'), show_progress=False).then(
+        lambda x: [gr.update(visible=x != 'instruct'), gr.update(visible=x == 'chat-instruct')], gradio('mode'), gradio('chat_style', 'chat-instruct_command'), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
         chat.redraw_html, gradio(reload_arr), gradio('display')).then(
diff --git a/modules/ui_default.py b/modules/ui_default.py
index bf9800f6..e3bfe784 100644
--- a/modules/ui_default.py
+++ b/modules/ui_default.py
@@ -16,7 +16,6 @@ outputs = ('output_textbox', 'html-default')
 
 def create_ui():
     mu = shared.args.multi_user
     with gr.Tab('Default', elem_id='default-tab'):
-        shared.gradio['last_input-default'] = gr.State('')
         with gr.Row():
             with gr.Column():
                 with gr.Row():
@@ -63,14 +62,12 @@ def create_event_handlers():
     shared.gradio['Generate-default'].click(
-        lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
 
     shared.gradio['textbox-default'].submit(
-        lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 3ebcd126..9a4e7351 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -115,6 +115,7 @@ def create_ui():
             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+            shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation=eager while loading the model.')
             shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
             shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.')
diff --git a/requirements.txt b/requirements.txt
index fb35c7d8..22ae8bed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-accelerate==0.30.*
-aqlm[gpu,cpu]==1.1.5; platform_system == "Linux"
+accelerate==0.31.*
+aqlm[gpu,cpu]==1.1.6; platform_system == "Linux"
 auto-gptq==0.7.1
 bitsandbytes==0.43.*
 colorama
@@ -7,7 +7,7 @@ datasets
 einops
 gradio==4.26.*
 hqq==0.1.7.post3
-jinja2==3.1.2
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -24,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
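[Review note] All requirements files below move llama-cpp-python 0.2.79 -> 0.2.81 in lockstep with the loader changes above; to my knowledge the 0.2.8x line is the first to bundle a llama.cpp build with Gemma-2 support, which the Gemma-2 handling elsewhere in this patch depends on. Note also that the macOS 11 (platform_release 20.x) wheels are dropped rather than rebuilt, so Big Sur installs no longer get a prebuilt llama-cpp-python wheel.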
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 5bb68522..464a09f4 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,10 +1,10 @@ -accelerate==0.30.* +accelerate==0.31.* colorama datasets einops gradio==4.26.* hqq==0.1.7.post3 -jinja2==3.1.2 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.41.* +transformers==4.42.* tqdm wandb @@ -32,14 
+32,14 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.79+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.79+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.81+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.81+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index eee2c662..9f700b4b 100644 --- a/requirements_amd_noavx2.txt +++ 
b/requirements_amd_noavx2.txt @@ -1,10 +1,10 @@ -accelerate==0.30.* +accelerate==0.31.* colorama datasets einops gradio==4.26.* hqq==0.1.7.post3 -jinja2==3.1.2 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.41.* +transformers==4.42.* tqdm wandb @@ -32,10 +32,10 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 61e3c47c..ef9a6d61 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,10 +1,10 @@ -accelerate==0.30.* +accelerate==0.31.* colorama datasets einops gradio==4.26.* hqq==0.1.7.post3 -jinja2==3.1.2 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.41.* +transformers==4.42.* tqdm wandb @@ -32,10 +32,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index b9497470..0e4574ee 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.31.*
 colorama
 datasets
 einops
 gradio==4.26.*
 hqq==0.1.7.post3
-jinja2==3.1.2
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,12 +32,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.79-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.81-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index ad38f23d..37a2d102 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.31.*
 colorama
 datasets
 einops
 gradio==4.26.*
 hqq==0.1.7.post3
-jinja2==3.1.2
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 0debda88..d2cc0cbe 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.31.*
 colorama
 datasets
 einops
 gradio==4.26.*
 hqq==0.1.7.post3
-jinja2==3.1.2
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -32,7 +32,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 78342a2e..6742f2e5 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,5 +1,5 @@
-accelerate==0.30.*
-aqlm[gpu,cpu]==1.1.5; platform_system == "Linux"
+accelerate==0.31.*
+aqlm[gpu,cpu]==1.1.6; platform_system == "Linux"
 auto-gptq==0.7.1
 bitsandbytes==0.43.*
 colorama
@@ -7,7 +7,7 @@ datasets
 einops
 gradio==4.26.*
 hqq==0.1.7.post3
-jinja2==3.1.2
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -24,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
 
@@ -35,22 +35,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.79+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.79+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.79+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index e96d468f..21025a62 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -1,10 +1,10 @@
-accelerate==0.30.*
+accelerate==0.31.*
 colorama
 datasets
 einops
 gradio==4.26.*
 hqq==0.1.7.post3
-jinja2==3.1.2
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.41.*
+transformers==4.42.*
 tqdm
 wandb
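
Reviewer note on the wheel lines above: everything after the ";" on a wheel URL is a PEP 508 environment marker, which is how pip selects exactly one llama-cpp-python build per OS, CPU feature set, and Python version, and why each version bump touches so many near-identical lines. Below is a minimal, illustrative sketch (not part of this patch) of how such a marker evaluates, using the standalone "packaging" library; pip vendors its own copy, but the public package behaves the same for this purpose.

# marker_demo.py -- illustrative only, not part of this patch
from packaging.markers import Marker

# Marker copied verbatim from one of the Linux AVX2 wheel lines above.
linux_marker = Marker(
    'platform_system == "Linux" and platform_machine == "x86_64"'
    ' and python_version == "3.11"'
)

# With no argument, evaluate() reads the running interpreter's environment,
# so this prints True only on x86_64 Linux under Python 3.11.
print(linux_marker.evaluate())

# An explicit environment can be supplied to test another platform, e.g.
# the macOS 14 (Darwin 23.x) arm64 wheels kept by this diff.
mac_marker = Marker(
    'platform_system == "Darwin" and platform_release >= "23.0.0"'
    ' and platform_release < "24.0.0" and python_version == "3.10"'
)
print(mac_marker.evaluate({
    "platform_system": "Darwin",
    "platform_release": "23.4.0",
    "python_version": "3.10",
}))

One consequence worth flagging: the macOS 11 (Darwin 20.x) arm64 lines are removed without replacements, so on those systems no marker matches and pip simply skips llama-cpp-python when installing requirements_apple_silicon.txt.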
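
For anyone verifying an environment after this bump, here is a quick, hedged sanity check. The package names and version prefixes are copied from the pins in this diff; the script itself is hypothetical and not part of the repo.

# check_pins.py -- illustrative only, not part of this patch
from importlib.metadata import PackageNotFoundError, version

# Expected version prefixes, taken from the updated pins in this diff.
EXPECTED = {
    "accelerate": "0.31.",
    "jinja2": "3.1.4",
    "transformers": "4.42.",
    "llama_cpp_python": "0.2.81",  # local tags like +cpuavx2 still match
}

for name, prefix in EXPECTED.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed")
        continue
    status = "ok" if installed.startswith(prefix) else f"expected {prefix}*"
    print(f"{name}: {installed} ({status})")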