diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 7fd11138..e2e16212 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,2 +1 @@ -github: oobabooga -ko_fi: oobabooga +patreon: oobabooga diff --git a/README.md b/README.md index 22ceeeb8..4b22f8d5 100644 --- a/README.md +++ b/README.md @@ -4,39 +4,35 @@ A Gradio web UI for Large Language Models. Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. -|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_instruct.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_chat.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/AFTER-INSTRUCT.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/AFTER-CHAT.png) | |:---:|:---:| -|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_default.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_parameters.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/AFTER-DEFAULT.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/AFTER-PARAMETERS.png) | ## Features -* Multiple backends for text generation in a single UI and API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported through the Transformers loader. -* OpenAI-compatible API server with Chat and Completions endpoints – see the [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). -* Automatic prompt formatting for each model using the Jinja2 template in its metadata. -* Three chat modes: `instruct`, `chat-instruct`, and `chat`, allowing for both instruction-following and casual conversations with characters. `chat-instruct` mode automatically applies the model's template to the chat prompt, ensuring high-quality outputs without manual setup. -* "Past chats" menu to quickly switch between conversations and start new ones. -* Free-form generation in the Default/Notebook tabs without being limited to chat turns. Send formatted chat conversations from the Chat tab to these tabs. -* Multiple sampling parameters and generation options for sophisticated text generation control. -* Easy switching between different models through the UI without restarting, using the "Model" tab. -* Simple LoRA fine-tuning tool to customize models with your data. -* All in one folder. The requirements are installed in a self-contained `installer_files` folder that doesn't interfere with the system's environment. -* Extensions support, including numerous built-in and user-contributed extensions. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. +- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). 
[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually. +- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). +- Automatic prompt formatting using Jinja2 templates. +- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. +- "Past chats" menu to quickly switch between conversations. +- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. +- Multiple sampling parameters and generation options for sophisticated text generation control. +- Switch between different models easily in the UI without restarting. +- Simple LoRA fine-tuning tool. +- Requirements installed in a self-contained `installer_files` directory that doesn't interfere with the system environment. +- Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install -1) Clone or [download](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip) the repository. -2) Run the `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat` script depending on your OS. +1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip). +2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`. 3) Select your GPU vendor when asked. 4) Once the installation ends, browse to `http://localhost:7860`. 5) Have fun! -To restart the web UI in the future, run the `start_` script again. +To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. -This script creates an `installer_files` folder where it sets up the project's requirements. If you need to reinstall the requirements, just delete that folder and start the web UI again. - -The script accepts command-line flags, such as `./start_linux.sh --help`. Alternatively, you can edit the `CMD_FLAGS.txt` file with a text editor and add your flags there, such as `--api` in case you need to use the API. - -To get updates in the future, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. +You can use command-line flags, like `./start_linux.sh --help`, or add them to `CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
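
As a minimal sketch of both approaches (the flag names below are taken from the usage listing further down; `--api` and `--listen` are shown only for illustration, so adjust them to your setup):

```
# Pass flags directly for a single launch:
./start_linux.sh --api --listen

# Or persist them in CMD_FLAGS.txt so every future launch picks them up:
echo "--api --listen" >> CMD_FLAGS.txt
./start_linux.sh
```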
@@ -80,12 +76,12 @@ conda activate textgen | System | GPU | Command | |--------|---------|---------| -| Linux/WSL | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | -| Linux/WSL | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.6` | -| MacOS + MPS | Any | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | -| Windows | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | -| Windows | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | +| Linux/WSL | NVIDIA | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121` | +| Linux/WSL | CPU only | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu` | +| Linux | AMD | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/rocm6.1` | +| MacOS + MPS | Any | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1` | +| Windows | NVIDIA | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121` | +| Windows | CPU only | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1` | The up-to-date commands can be found here: https://pytorch.org/get-started/locally/. @@ -150,7 +146,7 @@ Then browse to 1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12: ``` -pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118 +pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu118 conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime ``` @@ -206,18 +202,18 @@ List of command-line flags ```txt usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] - [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--chat-buttons] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] - [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] - [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] - [--flash-attn] [--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] - [--n-gpu-layers N_GPU_LAYERS] [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] - [--attention-sink-size ATTENTION_SINK_SIZE] [--tokenizer-dir TOKENIZER_DIR] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] - [--no_xformers] [--no_sdpa] [--cache_8bit] [--cache_4bit] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] - [--disable_exllama] 
[--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] + [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] + [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] + [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] + [--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS] + [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] [--attention-sink-size ATTENTION_SINK_SIZE] + [--tokenizer-dir TOKENIZER_DIR] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] + [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] - [--subpath SUBPATH] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui] - [--multimodal-pipeline MULTIMODAL_PIPELINE] [--model_type MODEL_TYPE] [--pre_layer PRE_LAYER [PRE_LAYER ...]] [--checkpoint CHECKPOINT] [--monkey-patch] [--no_inject_fused_attention] + [--subpath SUBPATH] [--old-colors] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] + [--api-disable-ipv4] [--nowebui] [--multimodal-pipeline MULTIMODAL_PIPELINE] [--cache_4bit] [--cache_8bit] [--chat-buttons] [--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] + [--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE] Text generation web UI @@ -236,12 +232,11 @@ Basic settings: file will be loaded by default without the need to use the --settings flag. --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. --verbose Print the prompts to the terminal. - --chat-buttons Show buttons on the chat tab instead of a hover menu. --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, - AutoGPTQ. + HQQ, TensorRT-LLM. Transformers/Accelerate: --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. @@ -259,6 +254,7 @@ Transformers/Accelerate: --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). 
Use this if you have any problems related to use_fast. --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. --use_eager_attention Set attn_implementation= eager while loading the model. + --torch-compile Compile the model with torch.compile for improved performance. bitsandbytes 4-bit: --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). @@ -268,7 +264,7 @@ bitsandbytes 4-bit: llama.cpp: --flash-attn Use flash-attention. - --tensorcores NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards. + --tensorcores NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. This may improve performance on newer cards. --n_ctx N_CTX Size of the prompt context. --threads THREADS Number of threads to use. --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. @@ -295,19 +291,8 @@ ExLlamaV2: --no_flash_attn Force flash-attention to not be used. --no_xformers Force xformers to not be used. --no_sdpa Force Torch SDPA to not be used. - --cache_8bit Use 8-bit cache to save VRAM. - --cache_4bit Use Q4 cache to save VRAM. --num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral. - -AutoGPTQ: - --triton Use triton. - --no_inject_fused_mlp Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference. - --no_use_cuda_fp16 This can make models faster on some systems. - --desc_act For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. - --disable_exllama Disable ExLlama kernel, which can improve inference speed on some systems. - --disable_exllamav2 Disable ExLlamav2 kernel. - --wbits WBITS Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. - --groupsize GROUPSIZE Group size. + --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. HQQ: --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. @@ -315,6 +300,9 @@ HQQ: TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. +Cache: + --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. + DeepSpeed: --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. @@ -336,6 +324,7 @@ Gradio: --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy + --old-colors Use the legacy Gradio colors, before the December/2024 update. API: --api Enable the API extension. @@ -344,6 +333,8 @@ API: --api-port API_PORT The listening port for the API. --api-key API_KEY API authentication key. --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. + --api-enable-ipv6 Enable IPv6 for the API + --api-disable-ipv4 Disable IPv4 for the API --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. 
Multimodal: @@ -368,7 +359,7 @@ text-generation-webui └── llama-2-13b-chat.Q4_K_M.gguf ``` -* The remaining model types (like 16-bit transformers models and GPTQ models) are made of several files and must be placed in a subfolder. Example: +* The remaining model types (like 16-bit Transformers models and EXL2 models) are made of several files and must be placed in a subfolder. Example: ``` text-generation-webui @@ -409,3 +400,7 @@ https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/ma ## Acknowledgment In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition. + +## ⭐ Featured Patreon Supporters + +* [Become the first one!](https://www.patreon.com/oobabooga) diff --git a/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf b/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf new file mode 100644 index 00000000..43ed4f5e Binary files /dev/null and b/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf differ diff --git a/css/Inter/Inter-VariableFont_opsz,wght.ttf b/css/Inter/Inter-VariableFont_opsz,wght.ttf new file mode 100644 index 00000000..e31b51e3 Binary files /dev/null and b/css/Inter/Inter-VariableFont_opsz,wght.ttf differ diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css index d626dbb1..854fff60 100644 --- a/css/chat_style-cai-chat-square.css +++ b/css/chat_style-cai-chat-square.css @@ -16,6 +16,6 @@ } .message { - padding-bottom: 30px; + padding-bottom: 2em; grid-template-columns: 70px minmax(0, 1fr); } diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 618184cf..93276bd3 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -1,7 +1,7 @@ .message { display: grid; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 15px; + padding-bottom: 2em; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 22.5px !important; @@ -9,6 +9,7 @@ .message-body { margin-top: 3px; + font-size: 15px !important; } .circle-you { diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 50b9402f..fcd0558f 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -1,21 +1,40 @@ .chat { background: transparent; - padding: 24px 19px; - padding-right: 19px !important; + padding: 0; padding-top: 0; } -.chat > .messages { - padding-top: 18px !important; +.chat > .messages:first-child { + padding-top: 0 !important; } -.message { - display: grid; - grid-template-columns: 60px 1fr; - padding-bottom: 25px; - font-size: 15px; - font-family: 'Noto Sans', Helvetica, Arial, sans-serif; - line-height: 24px; +.chat > .messages > :last-child { + margin-bottom: 1.7rem !important; +} + +.chat .message-body p, .chat .message-body li { + font-size: 1rem !important; + line-height: 28px !important; +} + +.dark .chat .message-body :is(p, li, q, h1, h2, h3, h4, h5, h6) { + color: #d1d5db !important; +} + +.chat .message-body :is(p, ul, ol) { + margin: 1.25em 0 !important; +} + +.chat .message-body :is(p, ul, ol):first-child { + margin-top: 0 !important; +} + +.chat .message-body :is(p, ul, ol):last-child { + margin-bottom: 0 !important; +} + +.user-message, .assistant-message { + font-family: Inter, Helvetica, Arial, sans-serif; } .message:first-child { @@ -26,49 +45,45 @@ display: none; } -.message-body p, .message-body li { - font-size: 15px !important; - line-height: 24px !important; -} - 
-.message-body p, .chat .message-body ul, .chat .message-body ol { - margin-bottom: 16px !important; -} - -.message-body p:last-child, .chat .message-body ul:last-child, .chat .message-body ol:last-child { - margin-bottom: 0 !important; -} - -.gradio-container .chat .assistant-message { - padding: 20px; +.chat .user-message { background: #f4f4f4; - margin-top: 9px !important; - margin-bottom: 12px !important; - border-radius: 7px; - border: 1px solid var(--border-color-primary); -} - -.dark .chat .assistant-message { - background: var(--color-grey-800); -} - -.gradio-container .chat .user-message { - padding: 20px; - padding-left: 0; - padding-right: 0; - background-color: transparent; - border-radius: 8px; + padding: 1.5rem 1rem; + padding-bottom: 2rem; + border-radius: 0; border-bottom-right-radius: 0; } -.gradio-container .chat .assistant-message:last-child, .gradio-container .chat .user-message:last-child { - margin-bottom: 0 !important; +.chat .assistant-message { + padding: 1.5rem 1rem; + padding-bottom: 2rem; + border-radius: 0; + border: 0; } -code { +.dark .chat .user-message { + background: transparent; +} + +.dark .chat .assistant-message { + background: var(--light-gray); +} + +.chat .user-message .text, +.chat .assistant-message .text { + max-width: 645px; + margin-left: auto; + margin-right: auto; +} + +/* Create space between two assistant messages in a row */ +.assistant-message + .assistant-message { + margin-top: 1.5rem; +} + +pre > code { background-color: #f3f4f6 !important; } -.dark code { +.dark pre > code { background-color: #1f2937 !important; } diff --git a/css/main.css b/css/main.css index cf3babdb..1a7efe70 100644 --- a/css/main.css +++ b/css/main.css @@ -1,7 +1,46 @@ +:root { + --darker-gray: #202123; + --dark-gray: #343541; + --light-gray: #444654; + --light-theme-gray: #f4f4f4; + --border-color-dark: #525252; + --header-width: 112px; + --selected-item-color-dark: #32333e; +} + +@font-face { + font-family: Inter; + src: url('file/css/Inter/Inter-VariableFont_opsz,wght.ttf') format('truetype'); + font-weight: 100 900; + font-style: normal; +} + +@font-face { + font-family: Inter; + src: url('file/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf') format('truetype'); + font-weight: 100 900; + font-style: italic; +} + .tabs.svelte-710i53 { margin-top: 0 } +.padded.svelte-12cmxck { + padding: 3px 0; +} + +div.svelte-sfqy0y, +div.svelte-iyf88w { + background: transparent; + border: 0; +} + +/* "info" messages without a title above */ +.block > .svelte-e8n7p6:not(:only-of-type, #chat-mode *) { + margin-bottom: 0; +} + .py-6 { padding-top: 2.5rem } @@ -19,7 +58,7 @@ height: 39.594px; align-self: end; line-height: 1em; - border-radius: 0.5em; + border-radius: 0.375rem; flex: none; } @@ -46,10 +85,6 @@ min-height: 0 } -.dark svg { - fill: white; -} - .dark a { color: white !important; } @@ -62,14 +97,20 @@ ol li p, ul li p { border: 0; } +#default-tab, #notebook-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab { + padding: 1rem; +} + .gradio-container { max-width: 100% !important; padding-top: 0 !important; } #extensions { - margin-top: 5px; - margin-bottom: 35px; + margin: 5px auto 35px; + max-width: 880px; + padding: 1em; + padding-left: calc(var(--header-width) + 1em); } .extension-tab { @@ -86,20 +127,29 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { } gradio-app > :first-child { - padding-left: var(--size-4) !important; - padding-right: var(--size-4) !important; + padding: 0 !important; } .header_bar { - 
background-color: #f4f4f4; box-shadow: 0 0 3px rgba(22 22 22 / 35%); margin-bottom: 0; overflow-x: scroll; - margin-left: calc(-1 * var(--size-4)); - margin-right: calc(-1 * var(--size-4)); - display: block !important; text-wrap: nowrap; z-index: 90; + position: fixed; + display: flex !important; + flex-direction: column; + height: 100dvh; + width: var(--header-width); +} + +.header_bar button { + margin: 0; + padding: 0.75rem; +} + +.header_bar button.selected { + border: 0; } .dark .header_bar { @@ -113,23 +163,23 @@ gradio-app > :first-child { } .textbox_default textarea { - height: calc(100dvh - 271px); + height: calc(100dvh - 201px); } .textbox_default_output textarea { - height: calc(100dvh - 185px); + height: calc(100dvh - 117px); } .textbox textarea { - height: calc(100dvh - 241px); + height: calc(100dvh - 172px); } .textbox_logits textarea { - height: calc(100dvh - 236px); + height: calc(100dvh - 205px); } .textbox_logits_notebook textarea { - height: calc(100dvh - 292px); + height: calc(100dvh - 221px); } .monospace textarea { @@ -149,24 +199,6 @@ gradio-app > :first-child { color: #efefef !important; } -@media screen and (width <= 711px) { - .textbox_default textarea { - height: calc(100dvh - 259px); - } - - div .default-token-counter { - top: calc( 0.5 * (100dvh - 236px) ) !important; - } - - .transparent-substring { - display: none; - } - - .hover-menu { - min-width: 250px !important; - } -} - /* Hide the gradio footer */ footer { display: none !important; @@ -194,6 +226,7 @@ button { max-width: 500px; background-color: var(--input-background-fill); border: var(--input-border-width) solid var(--input-border-color) !important; + padding: 10px; } .file-saver > :first-child > :last-child { @@ -227,11 +260,13 @@ button { .pretty_scrollbar::-webkit-scrollbar-thumb, .pretty_scrollbar::-webkit-scrollbar-thumb:hover { background: var(--neutral-300); + border-radius: 30px; } .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: var(--neutral-700); + background: #ccc; + border-radius: 10px; } .pretty_scrollbar::-webkit-resizer { @@ -239,7 +274,8 @@ button { } .dark .pretty_scrollbar::-webkit-resizer { - background: #374151; + background: #ccc; + border-radius: 10px; } .pretty_scrollbar::-webkit-scrollbar-corner { @@ -251,20 +287,26 @@ audio { } /* Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui */ -.token-counter { +#default-token-counter, #notebook-token-counter { position: absolute !important; - top: calc( 0.5 * (100dvh - 218px) ) !important; - right: 2px; z-index: 100; background: var(--input-background-fill) !important; min-height: 0 !important; + width: 0; + text-align: left; + direction: rtl; + right: 5px; } -.default-token-counter { - top: calc( 0.5 * (100dvh - 248px) ) !important; +#default-token-counter { + top: calc(100dvh - 200px) !important; } -.token-counter span { +#notebook-token-counter { + top: calc(100dvh - 171px) !important; +} + +#default-token-counter span, #notebook-token-counter span { padding: 1px; box-shadow: 0 0 0 0.3em rgb(192 192 192 / 15%), inset 0 0 0.6em rgb(192 192 192 / 7.5%); border: 2px solid rgb(192 192 192 / 40%) !important; @@ -272,15 +314,15 @@ audio { } .no-background { - background: var(--background-fill-primary) !important; + background: transparent; padding: 0 !important; } /* ---------------------------------------------- Chat tab ---------------------------------------------- */ -.h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { - height: 
66.67vh +.h-\[40dvh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { + height: 66.67dvh } .gradio-container { @@ -310,7 +352,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-tab { - padding-top: 0; + padding: 0; +} + +#chat-tab > :nth-child(1) { + display: flex; + flex-direction: row; + gap: 0; } #chat-tab button#Generate, #chat-tab button#stop { @@ -322,7 +370,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-tab > :first-child, #extensions { - max-width: 880px; margin-left: auto; margin-right: auto; } @@ -342,69 +389,101 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat { margin-left: auto; margin-right: auto; - max-width: 880px; min-height: var(--chat-height); overflow-y: auto; - padding-right: 15px; display: flex; flex-direction: column; word-break: break-word; overflow-wrap: anywhere; border-top: none; - border-radius: 0 0 0 8px; + border-radius: 0; visibility: visible; } .chat-parent { - height: calc(100dvh - 98px - var(--header-height) - var(--input-delta)); + height: calc(100dvh - 98px - var(--input-delta)); overflow: auto !important; border-radius: 0 !important; margin-bottom: var(--input-delta) !important; } -/* On desktop, automatically hide the chat scroll bar - * when not hovered. */ -@media (hover: hover) and (pointer: fine) { - .chat-parent { - visibility: hidden; - } - - .chat-parent:focus, .chat-parent:hover { - visibility: visible; - } -} - .chat-parent .prose { visibility: visible; } -.old-ui .chat-parent { - height: calc(100dvh - 192px - var(--header-height) - var(--input-delta)); - margin-bottom: var(--input-delta) !important; +.chat .message { + width: min(100%, 48rem); + margin-left: auto; + margin-right: auto; + text-align: start; + padding-left: 1rem; + padding-right: 1rem; } .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--header-height) - var(--input-delta)) !important; + height: calc(100dvh - 98px - var(--input-delta)) !important; margin-bottom: var(--input-delta) !important; } .chat > .messages { display: flex; flex-direction: column; - padding-top: 25px; } -.chat .message:last-child { - margin-bottom: 0 !important; - padding-bottom: 15px !important; +.chat > .messages > :first-child { + padding-top: 20px; +} + +.message-body { + font-size: 16px; +} + +.dark .message-body :is(h1, h2, h3, h4, h5, h6) { + color: white !important; +} + +.message-body h1 { + font-weight: 800; + font-size: 2.25em; + margin-top: 0; + margin-bottom: 0.8888889em; + line-height: 1.1111111; +} + +.message-body h2 { + font-weight: 700; + font-size: 1.5em; + margin-top: 2em; + margin-bottom: 1em; + line-height: 1.3333333; +} + +.message-body h3 { + font-weight: 600; + font-size: 1.25em; + margin-top: 1.6em; + margin-bottom: 0.6em; + line-height: 1.6; } -.message-body h1, -.message-body h2, -.message-body h3, .message-body h4 { - color: var(--body-text-color); - margin: 20px 0 10px 0; + font-weight: 600; + font-size: 1em; + margin-top: 1.5em; + margin-bottom: 0.5em; + line-height: 1.5; +} + +.message-body h5 { + font-weight: normal; + font-size: 1em; + margin: 0; +} + +.message-body h6 { + font-weight: normal; + font-size: 1em; + margin: 0; } .dark .message q { @@ -417,29 +496,28 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .message-body li { list-style-position: outside; + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body ul.long-list > li, +.message-body ol.long-list > li { + margin-top: 1.25em !important; + margin-bottom: 1.25em !important; +} + +.message-body a { + font-weight: 500; } .chat 
.message-body ul, .chat .message-body ol { padding-inline-start: 2em; } -.message-body li:not(:last-child) { - margin-top: 0 !important; - margin-bottom: 2px !important; -} - -.message-body li:last-child { - margin-bottom: 0 !important; -} - .message-body li > p { display: inline !important; } -.message-body ul, .message-body ol { - font-size: 15px !important; -} - .message-body ul { list-style-type: disc !important; } @@ -456,9 +534,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { overflow: scroll; } -.message-body code { - white-space: pre-wrap !important; - word-wrap: break-word !important; +.prose ul ul { + margin: 0; +} + +.message-body pre > code { + white-space: pre !important; + overflow-x: auto !important; + max-width: calc(100dvw - 39px); border: 1px solid #666; border-radius: 5px; font-size: 82%; @@ -467,7 +550,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { color: #1f2328; } -.dark .message-body code { +.dark .message-body pre > code { background: #0d1117 !important; color: rgb(201 209 217); } @@ -477,8 +560,18 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 15px; } +.message-body :not(pre) > code::before { + content: "`"; +} + +.message-body :not(pre) > code::after { + content: "`"; +} + .message-body :not(pre) > code { white-space: normal !important; + font-weight: bold; + font-family: unset; } #chat-input { @@ -488,6 +581,15 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border: none; } +#chat-input textarea { + padding: 0.65rem 2.5rem; +} + +#chat-input textarea::placeholder { + white-space: nowrap; + overflow: hidden; +} + #chat-input textarea:focus { box-shadow: none !important; } @@ -500,6 +602,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } +.chat-input-positioned { + position: absolute; + bottom: 0; + max-width: 54rem; + left: 50%; + transform: translateX(-50%); +} + @media print { body { visibility: hidden; @@ -535,7 +645,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #show-controls { position: absolute; - height: 100%; background-color: transparent; border: 0 !important; border-radius: 0; @@ -544,7 +653,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #show-controls label { z-index: 1000; position: absolute; - right: 0; + right: 30px; + top: 10px; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; @@ -626,7 +736,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: absolute; bottom: 80%; left: 0; - background-color: var(--background-fill-primary); box-shadow: 0 0 5px rgb(0 0 0 / 25%); z-index: 10000; min-width: 330px; @@ -637,7 +746,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { width: 100%; background: transparent !important; border-radius: 0 !important; - border-color: var(--border-color-primary); justify-content: space-between; margin: 0 !important; height: 36px; @@ -659,7 +767,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { opacity: 0.333; } -#chat-tab:not(.old-ui) #chat-buttons { +#chat-tab #chat-buttons { display: none !important; } @@ -690,57 +798,91 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-row { - padding-bottom: 20px; + padding-bottom: 1.5em; + padding-left: 1rem; + padding-right: 1rem; } -.old-ui #chat-input-row, #chat-input-row.bigchat { - padding-bottom: 0 !important; +#chat-input-row.bigchat { + padding-bottom: 1px !important; } #chat-col { padding-bottom: 100px; } -.old-ui #chat-col, #chat-col.bigchat { +@media screen and (width <= 924px) { + #chat-col { + padding-bottom: 100px; + margin-top: 32px; + position: relative; /* 
Ensure positioning for the pseudo-element */ + } + + .chat-parent { + height: calc(100dvh - 98px - var(--input-delta) - 32px); + } + + .chat-parent.bigchat { + height: calc(100dvh - 98px - var(--input-delta) - 32px) !important; + } +} + +#chat-col.bigchat { padding-bottom: 80px !important; } -.old-ui #chat-buttons #clear-history-confirm { - order: -1; -} - -.chat ol, .chat ul { - margin-top: 6px !important; +.message-body ol, .message-body ul { + margin-top: 0 !important; + margin-bottom: 1.25em !important; } /* ---------------------------------------------- Past chats menus ---------------------------------------------- */ #rename-row label { - margin-top: var(--layout-gap); + margin-top: 0; +} + +#rename-row > :nth-child(2) { + justify-content: center; } /* ---------------------------------------------- - Past chat histories in a side bar on desktop + Create the sidebars ---------------------------------------------- */ -@media screen and (width >= 1327px) { - #past-chats-row { - position: absolute; - top: 36px; - left: 0; - width: calc(0.5*(var(--document-width) - 880px - 120px - 16px*2)); - max-width: 300px; - margin-left: calc(-0.5*(var(--document-width) - 880px - 14px - 16px * 2)); - } +#chat-controls, +#past-chats-row { + width: 260px; + max-width: 80vw; + padding: 0.5rem; + height: 100dvh; + flex-shrink: 0; + box-sizing: content-box; + z-index: 10; +} - #chat-controls { - position: absolute; - top: 16px; - right: 0; - width: calc(0.5*(var(--document-width) - 880px - 120px - 16px*2)); - max-width: 400px; - margin-right: calc(-0.5*(var(--document-width) - 880px - 14px - 16px * 2)); - } +#past-chats-row:not(.negative-header) { + max-width: calc(85vw - var(--header-width)); +} + +#chat-controls { + padding: 1rem; + padding-bottom: 0; + overflow-y: scroll; +} + +#chat-controls > :nth-child(1) { + padding: 0.5rem; +} + +#past-chats-row + * { + width: unset; + flex-grow: 1; + flex-shrink: 1; +} + +#search_chat > :nth-child(2) > :first-child { + display: none; } /* ---------------------------------------------- @@ -748,6 +890,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { ---------------------------------------------- */ .options { z-index: 100 !important; + border: 1px solid var(--input-border-color); + border-radius: 0; } /* ---------------------------------------------- @@ -757,12 +901,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: fixed; bottom: 0; left: 0; - width: calc((100vw - 880px - 120px) /2); + width: calc(100vw / 2 - 600px); + z-index: 10000; } .pfp_character { position: relative; - z-index: 100; } .pfp_character:hover { @@ -776,10 +920,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #past-chats { - max-height: calc(100vh - 195px); + max-height: calc(100dvh - 135px); overflow-y: scroll !important; border-radius: 0; - scrollbar-width: none; /* Hide scrollbar in Firefox by default */ + scrollbar-width: auto; +} + +#past-chats::-webkit-scrollbar { + display: block; } #past-chats label { @@ -790,6 +938,24 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border-radius: 0; padding-top: 8px; padding-bottom: 8px; + position: relative; + min-height: 42px !important; +} + +#past-chats label::before { + content: url('data:image/svg+xml;utf8,'); + position: absolute; + top: 12px; + left: 12px; + margin-right: 8px; +} + +.dark #past-chats label::before { + content: url('data:image/svg+xml;utf8,'); +} + +#past-chats label span { + margin-left: 29px; } #past-chats > :nth-child(2) { @@ -797,23 +963,321 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } 
#past-chats > :nth-child(3) { - gap: 0; + gap: 0.25rem; } -#past-chats::-webkit-scrollbar { +#past-chats input { display: none; } -#past-chats:hover { - scrollbar-width: auto; +#past-chats label { + padding: 0.75rem; + font-size: 12.5px; + font-weight: 400; } -#past-chats:hover::-webkit-scrollbar { - display: block; +#past-chats .selected, +#past-chats label:hover { + border-radius: 0.5rem; } -@media screen and (width < 1327px) { - #past-chats { - max-height: 300px; +#past-chats label:hover { + cursor: pointer; +} + +#past-chats-buttons, +#delete-chat-row, +#rename-row { + width: 100%; + justify-content: center; + gap: 9px; +} + + +#past-chats-row, +#chat-controls { + width: 260px; + padding: 0.5rem; + height: calc(100dvh - 16px); + flex-shrink: 0; + box-sizing: content-box; +} + +.sidebar-hidden { + width: 0 !important; + padding: 0 !important; + overflow: hidden; +} + +#past-chats-toggle, +#chat-controls-toggle, +#navigation-toggle { + display: flex; + align-items: center; + justify-content: center; + cursor: pointer; + user-select: none; + border-radius: 3px; + z-index: 1000; + position: fixed; + width: 2rem; + height: 2rem; + top: 0; +} + +#past-chats-toggle svg, +#chat-controls-toggle svg, +#navigation-toggle svg { + pointer-events: none; +} + +@media screen and (width <= 408px) { + #past-chats-toggle.past-chats-open { + top: 28px; + } + + #chat-controls-toggle.chat-controls-open { + top: 28px; + right: calc(16px + min(260px, 80vw)) !important; } } + +#past-chats-toggle.past-chats-open.negative-header { + left: calc(min(260px, 85vw) + 16px); +} + +#past-chats-toggle.past-chats-open:not(.negative-header) { + left: calc(112px + min(260px, calc(85vw - var(--header-width))) + 16px); +} + +#past-chats-toggle.past-chats-closed:not(.negative-header) { + left: 112px; +} + +#past-chats-toggle.past-chats-closed.negative-header { + left: 0; + top: 28px; +} + +@media screen and (width <= 924px) { + #past-chats-toggle.past-chats-closed.negative-header { + left: 28px; + top: 0; + } +} + +.header_bar ~ * { + margin-left: var(--header-width); +} + +/* Positions for chat-controls-toggle */ +#chat-controls-toggle.chat-controls-open { + right: calc(min(260px, 80vw) + 23px); +} + +#chat-controls-toggle.chat-controls-closed { + right: 7px; +} + +@media screen and (width <= 924px) { + #chat-controls.sidebar-shown { + position: fixed; + right: 0; + } + + #past-chats-row.sidebar-shown { + position: fixed; + } +} + +/* ---------------------------------------------- + Dark theme +---------------------------------------------- */ +.dark .header_bar { + background-color: var(--darker-gray) !important; +} + +.dark .header_bar button.selected { + background: var(--selected-item-color-dark); +} + +.dark #chat-input textarea { + background: var(--light-gray); + color: white !important; + border-color: #292c3b; +} + +.dark #chat-input textarea::placeholder { + color: #9ca3af; +} + +.dark .hover-menu { + background-color: var(--darker-gray); +} + +.dark .hover-menu button { + border-color: var(--border-color-primary); +} + +.dark #chat-controls, +.dark #past-chats-row { + background-color: var(--darker-gray); + border: 0 !important; +} + +.dark #past-chats .selected, +.dark #past-chats label:hover { + background-color: var(--selected-item-color-dark) !important; +} + +.dark #past-chats-row, +.dark #chat-controls { + background-color: var(--darker-gray); +} + +.dark #past-chats-toggle, +.dark #chat-controls-toggle, +.dark #navigation-toggle { + color: white; +} + +.dark svg { + color: white; +} + +@media screen and 
(width <= 408px) { + .dark #past-chats-toggle.past-chats-open { + background: var(--darker-gray); + } + + .dark #chat-controls-toggle.chat-controls-open { + background: var(--darker-gray); + } +} + +/* ---------------------------------------------- + Light theme +---------------------------------------------- */ +.header_bar { + background-color: var(--light-theme-gray) !important; +} + +.header_bar button.selected { + background: white; +} + +#chat-controls, +#past-chats-row { + background-color: var(--light-theme-gray); +} + +#chat-controls { + border-left: 1px solid #d9d9d0; +} + +#past-chats-row { + border-right: 1px solid #d9d9d0; +} + +#past-chats-toggle, +#chat-controls-toggle, +#navigation-toggle { + color: gray !important; +} + +.mobile-top-bar { + position: fixed; + top: 0; + left: 0; + width: 100%; + height: 32px; + z-index: 2; + opacity: 0; + pointer-events: none; +} + +@media screen and (width <= 924px) { + .mobile-top-bar { + opacity: 1; + pointer-events: auto; + } + + .dark .mobile-top-bar { + background-color: var(--darker-gray); + } + + .mobile-top-bar { + background-color: var(--light-theme-gray); + } +} + +@media screen and (width <= 408px) { + #past-chats-toggle.past-chats-open { + background: var(--light-theme-gray); + } + + #chat-controls-toggle.chat-controls-open { + background: var(--light-theme-gray); + } +} + +/* ---------------------------------------------- + Copy button for chat messages +---------------------------------------------- */ +.message .text, +.message .text-you, +.message .text-bot, +.user-message .text, +.assistant-message .text { + position: relative; +} + +.message, .user-message, .assistant-message { + position: relative; +} + +.footer-button { + position: absolute; + padding: 0; + margin: 0; + border: none; + border-radius: 3px; + cursor: pointer; + opacity: 0; + display: flex; + align-items: center; + transition: opacity 0.2s; +} + +.footer-button.footer-copy-button { + bottom: -23px; + left: 0; +} + +.footer-button.footer-refresh-button { + bottom: -23px; + left: 25px; +} + +.message:hover .footer-button, +.user-message:hover .footer-button, +.assistant-message:hover .footer-button { + opacity: 1; +} + +.footer-button svg { + stroke: rgb(156 163 175); + transition: stroke 0.2s; +} + +.footer-button:hover svg { + stroke: rgb(107 114 128); +} + +.dark .footer-button svg { + stroke: rgb(156 163 175); +} + +.dark .footer-button:hover svg { + stroke: rgb(209 213 219); +} diff --git a/docker/.env.example b/docker/.env.example index 2de9f0ab..bd0f8bcc 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -1,6 +1,7 @@ # by default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX # however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5 # https://developer.nvidia.com/cuda-gpus you can find the version for your card here +# Or for a programatic approach run `nvidia-smi --query-gpu=name,compute_cap --format=csv` TORCH_CUDA_ARCH_LIST=7.5 # the port the webui binds to on the host HOST_PORT=7860 diff --git a/docker/amd/Dockerfile b/docker/amd/Dockerfile index 365e88e3..cfbcf7e4 100644 --- a/docker/amd/Dockerfile +++ b/docker/amd/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=B USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=C 
LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile index 04ccf94a..8f643d2f 100644 --- a/docker/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -17,7 +17,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=N USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=N LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime diff --git a/docker/intel/Dockerfile b/docker/intel/Dockerfile index bc67a185..d2ed671e 100644 --- a/docker/intel/Dockerfile +++ b/docker/intel/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=D USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=E LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile index 66a717a7..900a4329 100644 --- a/docker/nvidia/Dockerfile +++ b/docker/nvidia/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index b00a1f34..9b4f89bf 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -19,7 +19,7 @@ Add `--api` to your command-line flags. ### Examples -For the documentation with all the parameters and their types, consult `http://127.0.0.1:5000/docs` or the [typing.py](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/typing.py) file. +For the documentation with all the endpoints, parameters and their types, consult `http://127.0.0.1:5000/docs` or the [typing.py](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/typing.py) file. The official examples in the [OpenAI documentation](https://platform.openai.com/docs/api-reference) should also work, and the same parameters apply (although the API here has more optional parameters). 
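
For instance, a minimal sketch of a completions request mixing both kinds of parameters (`prompt`, `max_tokens`, and `temperature` are standard OpenAI fields, while `min_p` is one of the extra sampling options defined in `extensions/openai/typing.py`):

```shell
curl -k http://127.0.0.1:5000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "Once upon a time,",
    "max_tokens": 64,
    "temperature": 0.7,
    "min_p": 0.05
  }'
```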
@@ -114,6 +114,30 @@ curl -k http://127.0.0.1:5000/v1/internal/logits \ }' ``` +#### List models + +```shell +curl -k http://127.0.0.1:5000/v1/internal/model/list \ + -H "Content-Type: application/json" +``` + +#### Load model + +```shell +curl -k http://127.0.0.1:5000/v1/internal/model/load \ + -H "Content-Type: application/json" \ + -d '{ + "model_name": "model_name", + "args": { + "load_in_4bit": true, + "n_gpu_layers": 12 + }, + "settings": { + "instruction_template": "Alpaca" + } + }' +``` + #### Python chat example ```python diff --git a/download-model.py b/download-model.py index 306784a3..8fe94371 100644 --- a/download-model.py +++ b/download-model.py @@ -72,7 +72,7 @@ class ModelDownloader: return model, branch - def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None): + def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None, exclude_pattern=None): session = self.session page = f"/api/models/{model}/tree/{branch}" cursor = b"" @@ -100,13 +100,17 @@ class ModelDownloader: if specific_file not in [None, ''] and fname != specific_file: continue + # Exclude files matching the exclude pattern + if exclude_pattern is not None and re.match(exclude_pattern, fname): + continue + if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')): is_lora = True is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname) is_safetensors = re.match(r".*\.safetensors", fname) is_pt = re.match(r".*\.pt", fname) - is_gguf = re.match(r'.*\.gguf', fname) + is_gguf = re.match(r".*\.gguf", fname) is_tiktoken = re.match(r".*\.tiktoken", fname) is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) or is_tiktoken is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer @@ -140,7 +144,6 @@ class ModelDownloader: # If both pytorch and safetensors are available, download safetensors only # Also if GGUF and safetensors are available, download only safetensors - # (why do people do this?) if (has_pytorch or has_pt or has_gguf) and has_safetensors: has_gguf = False for i in range(len(classifications) - 1, -1, -1): @@ -148,8 +151,6 @@ class ModelDownloader: links.pop(i) # For GGUF, try to download only the Q4_K_M if no specific file is specified. - # If not present, exclude all GGUFs, as that's likely a repository with both - # GGUF and fp16 files. 
if has_gguf and specific_file is None: has_q4km = False for i in range(len(classifications) - 1, -1, -1): @@ -312,6 +313,7 @@ if __name__ == '__main__': parser.add_argument('--threads', type=int, default=4, help='Number of files to download simultaneously.') parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).') parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).') + parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.') parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.') parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).') parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.') @@ -322,6 +324,7 @@ if __name__ == '__main__': branch = args.branch model = args.MODEL specific_file = args.specific_file + exclude_pattern = args.exclude_pattern if model is None: print("Error: Please specify the model you'd like to download (e.g. 'python download-model.py facebook/opt-1.3b').") @@ -336,7 +339,9 @@ if __name__ == '__main__': sys.exit() # Get the download links from Hugging Face - links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file) + links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface( + model, branch, text_only=args.text_only, specific_file=specific_file, exclude_pattern=exclude_pattern + ) # Get the output folder if args.output: @@ -349,4 +354,7 @@ if __name__ == '__main__': downloader.check_model_files(model, branch, links, sha256, output_folder) else: # Download files - downloader.download_model_files(model, branch, links, sha256, output_folder, specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp) + downloader.download_model_files( + model, branch, links, sha256, output_folder, + specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp + ) diff --git a/extensions/Training_PRO/script.py b/extensions/Training_PRO/script.py index 8f296462..01bcf67d 100644 --- a/extensions/Training_PRO/script.py +++ b/extensions/Training_PRO/script.py @@ -241,7 +241,7 @@ def ui(): stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') with gr.Column(): - max_length = gr.Slider(label='max_length', minimum=0, maximum=shared.settings['truncation_length_max'], value=0, step=1, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.') + max_length = gr.Number(label='max_length', precision=0, step=256, value=0, info='The context for each evaluation. 
If set to 0, the maximum context length for the model will be used.') with gr.Row(): start_current_evaluation = gr.Button("Evaluate loaded model") @@ -789,7 +789,11 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'): logger.info("Getting model ready...") # here we can disable gradient checkpoint, by default = true, use_gradient_checkpointing=True - prepare_model_for_kbit_training(shared.model) + if 'quantization_config' in shared.model.config.to_dict(): + print(f"Method: {RED}QLORA{RESET}") + prepare_model_for_kbit_training(shared.model) + else: + print(f"Method: {RED}LoRA{RESET}") # base model is now frozen and should not be reused for any other LoRA training than this one shared.model_dirty_from_training = True diff --git a/extensions/coqui_tts/requirements.txt b/extensions/coqui_tts/requirements.txt index 747f99a0..b0b691e8 100644 --- a/extensions/coqui_tts/requirements.txt +++ b/extensions/coqui_tts/requirements.txt @@ -1 +1 @@ -TTS==0.21.* \ No newline at end of file +coqui-tts==0.25.1 diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index ff0242c8..76be4a58 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -5,7 +5,6 @@ import gradio as gr from modules.html_generator import get_image_cache from modules.shared import gradio - params = { 'items_per_page': 50, 'open': False, @@ -93,10 +92,11 @@ def generate_html(): def filter_cards(filter_str=''): if filter_str == '': - return cards + return gr.Dataset(samples=cards) filter_upper = filter_str.upper() - return [k for k in cards if filter_upper in k[1].upper()] + filtered = [k for k in cards if filter_upper in k[1].upper()] + return gr.Dataset(samples=filtered) def select_character(evt: gr.SelectData): diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 646dee2d..2cefc22b 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -143,20 +143,20 @@ def convert_history(history): new_history = [] for entry in history: if isinstance(entry['content'], list): - image_url = None - content = None for item in entry['content']: if not isinstance(item, dict): continue - + + image_url = None + content = None if item['type'] == 'image_url' and isinstance(item['image_url'], dict): image_url = item['image_url']['url'] elif item['type'] == 'text' and isinstance(item['text'], str): content = item['text'] - - if image_url and content: - new_history.append({"image_url": image_url, "role": "user"}) - new_history.append({"content": content, "role": "user"}) + if image_url: + new_history.append({"image_url": image_url, "role": "user"}) + if content: + new_history.append({"content": content, "role": "user"}) else: new_history.append(entry) @@ -234,7 +234,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p raise InvalidRequestError(message="messages: missing content", param='messages') # Chat Completions - object_type = 'chat.completions' if not stream else 'chat.completions.chunk' + object_type = 'chat.completion' if not stream else 'chat.completion.chunk' created_time = int(time.time()) cmpl_id = "chatcmpl-%d" % (int(time.time() * 1000000000)) resp_list = 'data' if is_legacy else 'choices' diff --git a/extensions/openai/script.py b/extensions/openai/script.py index e85738c3..bbbbae51 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -353,23 +353,38 @@ async def 
handle_unload_loras(): def run_server(): - server_addr = '0.0.0.0' if shared.args.listen else '127.0.0.1' + # Parse configuration port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port)) - ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile) ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile) + # In the server configuration: + server_addrs = [] + if os.environ.get('OPENEDAI_ENABLE_IPV6', shared.args.api_enable_ipv6): + server_addrs.append('[::]' if shared.args.listen else '[::1]') + if not os.environ.get('OPENEDAI_DISABLE_IPV4', shared.args.api_disable_ipv4): + server_addrs.append('0.0.0.0' if shared.args.listen else '127.0.0.1') + + if not server_addrs: + raise Exception('you MUST enable IPv6 or IPv4 for the API to work') + + # Log server information if shared.args.public_api: - def on_start(public_url: str): - logger.info(f'OpenAI-compatible API URL:\n\n{public_url}\n') - - _start_cloudflared(port, shared.args.public_api_id, max_attempts=3, on_start=on_start) + _start_cloudflared( + port, + shared.args.public_api_id, + max_attempts=3, + on_start=lambda url: logger.info(f'OpenAI-compatible API URL:\n\n{url}\n') + ) else: - if ssl_keyfile and ssl_certfile: - logger.info(f'OpenAI-compatible API URL:\n\nhttps://{server_addr}:{port}\n') + url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://' + urls = [f'{url_proto}{addr}:{port}' for addr in server_addrs] + if len(urls) > 1: + logger.info('OpenAI-compatible API URLs:\n\n' + '\n'.join(urls) + '\n') else: - logger.info(f'OpenAI-compatible API URL:\n\nhttp://{server_addr}:{port}\n') + logger.info('OpenAI-compatible API URL:\n\n' + '\n'.join(urls) + '\n') + # Log API keys if shared.args.api_key: if not shared.args.admin_key: shared.args.admin_key = shared.args.api_key @@ -379,8 +394,9 @@ def run_server(): if shared.args.admin_key and shared.args.admin_key != shared.args.api_key: logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n') + # Start server logging.getLogger("uvicorn.error").propagate = False - uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) + uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) def setup(): diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 4015f6a1..5f0e0128 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -7,45 +7,48 @@ from pydantic import BaseModel, Field class GenerationOptions(BaseModel): preset: str | None = Field(default=None, description="The name of a file under text-generation-webui/presets (without the .yaml extension). 
The sampling parameters that get overwritten by this option are the keys in the default_preset() function in modules/presets.py.") - min_p: float = 0 - dynamic_temperature: bool = False dynatemp_low: float = 1 dynatemp_high: float = 1 dynatemp_exponent: float = 1 smoothing_factor: float = 0 smoothing_curve: float = 1 + min_p: float = 0 top_k: int = 0 - repetition_penalty: float = 1 - repetition_penalty_range: int = 1024 typical_p: float = 1 - tfs: float = 1 - top_a: float = 0 + xtc_threshold: float = 0.1 + xtc_probability: float = 0 epsilon_cutoff: float = 0 eta_cutoff: float = 0 - guidance_scale: float = 1 - negative_prompt: str = '' + tfs: float = 1 + top_a: float = 0 + dry_multiplier: float = 0 + dry_allowed_length: int = 2 + dry_base: float = 1.75 + repetition_penalty: float = 1 + encoder_repetition_penalty: float = 1 + no_repeat_ngram_size: int = 0 + repetition_penalty_range: int = 1024 penalty_alpha: float = 0 + guidance_scale: float = 1 mirostat_mode: int = 0 mirostat_tau: float = 5 mirostat_eta: float = 0.1 - temperature_last: bool = False - do_sample: bool = True - seed: int = -1 - encoder_repetition_penalty: float = 1 - no_repeat_ngram_size: int = 0 - dry_multiplier: float = 0 - dry_base: float = 1.75 - dry_allowed_length: int = 2 - dry_sequence_breakers: str = '"\\n", ":", "\\"", "*"' - truncation_length: int = 0 - max_tokens_second: int = 0 prompt_lookup_num_tokens: int = 0 - custom_token_bans: str = "" - sampler_priority: List[str] | str | None = Field(default=None, description="List of samplers where the first items will appear first in the stack. Example: [\"top_k\", \"temperature\", \"top_p\"].") + max_tokens_second: int = 0 + do_sample: bool = True + dynamic_temperature: bool = False + temperature_last: bool = False auto_max_new_tokens: bool = False ban_eos_token: bool = False add_bos_token: bool = True skip_special_tokens: bool = True + static_cache: bool = False + truncation_length: int = 0 + seed: int = -1 + sampler_priority: List[str] | str | None = Field(default=None, description="List of samplers where the first items will appear first in the stack. 
Example: [\"top_k\", \"temperature\", \"top_p\"].") + custom_token_bans: str = "" + negative_prompt: str = '' + dry_sequence_breakers: str = '"\\n", ":", "\\"", "*"' grammar_string: str = "" diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index e45c8b1e..d949e93f 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -96,7 +96,7 @@ def ui(): with gr.Accordion("Settings", open=False): auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit']) device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"]) - whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"]) + whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "turbo"]) whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"]) audio.change( diff --git a/instruction-templates/RWKV-World.yaml b/instruction-templates/RWKV-World.yaml new file mode 100644 index 00000000..bf65511b --- /dev/null +++ b/instruction-templates/RWKV-World.yaml @@ -0,0 +1,25 @@ +instruction_template: |- + {%- set ns = namespace(found=false) -%} + {%- for message in messages -%} + {%- if message['role'] == 'system' -%} + {%- set ns.found = true -%} + {%- endif -%} + {%- endfor -%} + {%- if not ns.found -%} + {{- '' + '' + '' -}} + {%- endif %} + {%- for message in messages %} + {%- if message['role'] == 'system' -%} + {{- '' + message['content'] + '' -}} + {%- else -%} + {%- if message['role'] == 'user' -%} + {{-'User: ' + message['content'] + '\n\n'-}} + {%- else -%} + {{-'Assistant: ' + message['content'] + '\n\n' -}} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- if add_generation_prompt -%} + {{-'Assistant:'-}} + {%- endif -%} + diff --git a/js/global_scope_js.js b/js/global_scope_js.js new file mode 100644 index 00000000..6bf0f0e3 --- /dev/null +++ b/js/global_scope_js.js @@ -0,0 +1,46 @@ +function copyToClipboard(element) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const rawText = 
messageElement.getAttribute("data-raw"); + if (!rawText) return; + + navigator.clipboard.writeText(rawText).then(function() { + const originalSvg = element.innerHTML; + element.innerHTML = ""; + setTimeout(() => { + element.innerHTML = originalSvg; + }, 1000); + }).catch(function(err) { + console.error("Failed to copy text: ", err); + }); +} + +function regenerateClick() { + document.getElementById("Regenerate").click(); +} + +function handleMorphdomUpdate(text) { + morphdom( + document.getElementById("chat").parentNode, + "
" + text + "
", + { + onBeforeElUpdated: function(fromEl, toEl) { + if (fromEl.tagName === "PRE" && fromEl.querySelector("code[data-highlighted]")) { + const fromCode = fromEl.querySelector("code"); + const toCode = toEl.querySelector("code"); + + if (fromCode && toCode && fromCode.textContent === toCode.textContent) { + // If the content is the same, preserve the entire
 element
+            toEl.className = fromEl.className;
+            toEl.innerHTML = fromEl.innerHTML;
+            return false; // Skip updating the <pre> element
+          }
+        }
+        return !fromEl.isEqualNode(toEl); // Update only if nodes differ
+      }
+    }
+  );
+}
diff --git a/js/main.js b/js/main.js
index 899bd8f0..c5c47d04 100644
--- a/js/main.js
+++ b/js/main.js
@@ -18,16 +18,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
   if (extensionsVisible) {
     if (extensions) {
       extensions.style.display = "flex";
-      extensions.style.maxWidth = chatVisible ? "880px" : "none";
-      extensions.style.padding = chatVisible ? "0px" : "15px";
     }
+
     this.style.marginBottom = chatVisible ? "0px" : "19px";
 
     if (chatVisible && !showControlsChecked) {
-      document.querySelectorAll("#chat-tab > div > :nth-child(n+2), #extensions").forEach(element => {
+      document.querySelectorAll(
+        "#chat-tab > div > :nth-child(1), #chat-tab > div > :nth-child(3), #chat-tab > div > :nth-child(4), #extensions"
+      ).forEach(element => {
         element.style.display = "none";
       });
     }
+
   } else {
     this.style.marginBottom = "19px";
     if (extensions) extensions.style.display = "none";
@@ -132,8 +134,7 @@ targetElement.addEventListener("scroll", function() {
 const observer = new MutationObserver(function(mutations) {
   updateCssProperties();
 
-  const firstChild = targetElement.children[0];
-  if (firstChild.classList.contains("generating")) {
+  if (targetElement.classList.contains("_generating")) {
     typing.parentNode.classList.add("visible-dots");
     document.getElementById("stop").style.display = "flex";
     document.getElementById("Generate").style.display = "none";
@@ -146,10 +147,9 @@ const observer = new MutationObserver(function(mutations) {
 
   doSyntaxHighlighting();
 
-  if(!isScrolled) {
+  if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) {
     targetElement.scrollTop = targetElement.scrollHeight;
   }
-
 });
 
 // Configure the observer to watch for changes in the subtree and attributes
@@ -177,47 +177,30 @@ function isElementVisibleOnScreen(element) {
   );
 }
 
-function getVisibleMessagesIndexes() {
-  const elements = document.querySelectorAll(".message-body");
-  const visibleIndexes = [];
-
-  elements.forEach((element, index) => {
-    if (isElementVisibleOnScreen(element) && !element.hasAttribute("data-highlighted")) {
-      visibleIndexes.push(index);
-    }
-  });
-
-  return visibleIndexes;
-}
-
 function doSyntaxHighlighting() {
-  const indexes = getVisibleMessagesIndexes();
-  const elements = document.querySelectorAll(".message-body");
+  const messageBodies = document.querySelectorAll(".message-body");
 
-  if (indexes.length > 0) {
+  if (messageBodies.length > 0) {
     observer.disconnect();
 
-    indexes.forEach((index) => {
-      const element = elements[index];
+    messageBodies.forEach((messageBody) => {
+      if (isElementVisibleOnScreen(messageBody)) {
+        // Handle both code and math in a single pass through each message
+        const codeBlocks = messageBody.querySelectorAll("pre code:not([data-highlighted])");
+        codeBlocks.forEach((codeBlock) => {
+          hljs.highlightElement(codeBlock);
+          codeBlock.setAttribute("data-highlighted", "true");
+        });
 
-      // Tag this element to prevent it from being highlighted twice
-      element.setAttribute("data-highlighted", "true");
-
-      // Perform syntax highlighting
-      const codeBlocks = element.querySelectorAll("pre code");
-
-      codeBlocks.forEach((codeBlock) => {
-        hljs.highlightElement(codeBlock);
-      });
-
-      renderMathInElement(element, {
-        delimiters: [
-          { left: "$$", right: "$$", display: true },
-          { left: "$", right: "$", display: false },
-          { left: "\\(", right: "\\)", display: false },
-          { left: "\\[", right: "\\]", display: true },
-        ],
-      });
+        renderMathInElement(messageBody, {
+          delimiters: [
+            { left: "$$", right: "$$", display: true },
+            { left: "$", right: "$", display: false },
+            { left: "\\(", right: "\\)", display: false },
+            { left: "\\[", right: "\\]", display: true },
+          ],
+        });
+      }
     });
 
     observer.observe(targetElement, config);
@@ -255,7 +238,7 @@ for (i = 0; i < slimDropdownElements.length; i++) {
 // The show/hide events were adapted from:
 // https://github.com/SillyTavern/SillyTavern/blob/6c8bd06308c69d51e2eb174541792a870a83d2d6/public/script.js
 //------------------------------------------------
-var buttonsInChat = document.querySelectorAll("#chat-tab:not(.old-ui) #chat-buttons button");
+var buttonsInChat = document.querySelectorAll("#chat-tab #chat-buttons button");
 var button = document.getElementById("hover-element-button");
 var menu = document.getElementById("hover-menu");
 var istouchscreen = (navigator.maxTouchPoints > 0) || "ontouchstart" in document.documentElement;
@@ -290,12 +273,6 @@ if (buttonsInChat.length > 0) {
       thisButton.innerHTML = newText;
     }
   }
-} else {
-  buttonsInChat = document.querySelectorAll("#chat-tab.old-ui #chat-buttons button");
-  for (let i = 0; i < buttonsInChat.length; i++) {
-    buttonsInChat[i].textContent = buttonsInChat[i].textContent.replace(/ \(.*?\)/, "");
-  }
-  document.getElementById("gr-hover-container").style.display = "none";
 }
 
 function isMouseOverButtonOrMenu() {
@@ -339,6 +316,8 @@ menu.addEventListener("mouseleave", function () {
 
 // Add event listener for click anywhere in the document
 document.addEventListener("click", function (event) {
+  const target = event.target;
+
   // Check if the click is outside the button/menu and the menu is visible
   if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") {
     hideMenu();
@@ -347,6 +326,21 @@ document.addEventListener("click", function (event) {
   if (event.target.classList.contains("pfp_character")) {
     toggleBigPicture();
   }
+
+  // Handle sidebar clicks on mobile
+  if (isMobile()) {
+    // Check if the click did NOT originate from any of the specified toggle buttons or elements
+    if (
+      target.closest("#navigation-toggle") !== navigationToggle &&
+      target.closest("#past-chats-toggle") !== pastChatsToggle &&
+      target.closest("#chat-controls-toggle") !== chatControlsToggle &&
+      target.closest(".header_bar") !== headerBar &&
+      target.closest("#past-chats-row") !== pastChatsRow &&
+      target.closest("#chat-controls") !== chatControlsRow
+    ) {
+      handleIndividualSidebarClose(event);
+    }
+  }
 });
 
 //------------------------------------------------
@@ -361,10 +355,9 @@ for (var i = 0; i < 2; i++) {
 parent.insertBefore(elementToMove, parent.firstChild);
 
 //------------------------------------------------
-// Make the chat input grow upwards instead of downwards
+// Position the chat input
 //------------------------------------------------
-document.getElementById("show-controls").parentNode.style.position = "absolute";
-document.getElementById("show-controls").parentNode.style.bottom = "0px";
+document.getElementById("show-controls").parentNode.classList.add("chat-input-positioned");
 
 //------------------------------------------------
 // Focus on the chat input
@@ -435,35 +428,33 @@ function toggleBigPicture() {
 //------------------------------------------------
 // Handle the chat input box growth
 //------------------------------------------------
-let currentChatInputHeight = 0;
+
+// Cache DOM elements
+const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
+const chatInput = document.querySelector("#chat-input textarea");
+
+// Variables to store current dimensions
+let currentChatInputHeight = chatInput.clientHeight;
 
 // Update chat layout based on chat and input dimensions
 function updateCssProperties() {
-  const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
-  const chatInputHeight = document.querySelector("#chat-input textarea").clientHeight;
+  const chatInputHeight = chatInput.clientHeight;
 
   // Check if the chat container is visible
   if (chatContainer.clientHeight > 0) {
-    var numericHeight = chatContainer.parentNode.clientHeight - chatInputHeight + 40 - 100;
-    if (document.getElementById("chat-tab").style.paddingBottom != "") {
-      numericHeight += 20;
-    }
+    const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
+    const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
 
-    const newChatHeight = `${numericHeight}px`;
     document.documentElement.style.setProperty("--chat-height", newChatHeight);
     document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
 
-    // Get and set header height
-    const header = document.querySelector(".header_bar");
-    const headerHeight = `${header.clientHeight}px`;
-    document.documentElement.style.setProperty("--header-height", headerHeight);
-
     // Adjust scrollTop based on input height change
     if (chatInputHeight !== currentChatInputHeight) {
-      if (!isScrolled && chatInputHeight < currentChatInputHeight) {
+      const deltaHeight = chatInputHeight - currentChatInputHeight;
+      if (!isScrolled && deltaHeight < 0) {
         chatContainer.scrollTop = chatContainer.scrollHeight;
       } else {
-        chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
+        chatContainer.scrollTop += deltaHeight;
       }
 
       currentChatInputHeight = chatInputHeight;
@@ -477,18 +468,6 @@ new ResizeObserver(updateCssProperties).observe(document.querySelector("#chat-in
 // Handle changes in window size
 window.addEventListener("resize", updateCssProperties);
 
-//------------------------------------------------
-// Keep track of the display width to position the past
-// chats dropdown on desktop
-//------------------------------------------------
-function updateDocumentWidth() {
-  var updatedWidth = window.innerWidth || document.documentElement.clientWidth || document.body.clientWidth;
-  document.documentElement.style.setProperty("--document-width", updatedWidth + "px");
-}
-
-updateDocumentWidth();
-window.addEventListener("resize", updateDocumentWidth);
-
 //------------------------------------------------
 // Focus on the rename text area when it becomes visible
 //------------------------------------------------
@@ -568,6 +547,8 @@ function moveToChatTab() {
     grandParent.style.display = "none";
   }
 
+  grandParent.children[0].style.minWidth = "100%";
+
   const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild;
   const newParent = chatControlsFirstChild;
   let newPosition = newParent.children.length - 2;
@@ -586,6 +567,7 @@ function restoreOriginalPosition() {
 
     document.getElementById("save-character").style.display = "";
     movedElement.style.display = "";
+    movedElement.children[0].style.minWidth = "";
   }
 }
 
@@ -600,4 +582,234 @@ headerBar.addEventListener("click", (e) => {
   }
 });
 
+//------------------------------------------------
+// Add a confirmation dialog when leaving the page
+// Useful to avoid data loss
+//------------------------------------------------
+window.addEventListener("beforeunload", function (event) {
+  // Cancel the event
+  event.preventDefault();
+  // Chrome requires returnValue to be set
+  event.returnValue = "";
+});
+
 moveToChatTab();
+
+//------------------------------------------------
+// Buttons to toggle the sidebars
+//------------------------------------------------
+
+const leftArrowSVG = `
+
+  
+  
+  
+  
+`;
+
+const rightArrowSVG = `
+
+  
+  
+  
+  
+`;
+
+const hamburgerMenuSVG = `
+
+  
+  
+  
+`;
+
+const closeMenuSVG = `
+
+  
+  
+`;
+
+const chatTab = document.getElementById("chat-tab");
+const pastChatsRow = document.getElementById("past-chats-row");
+const chatControlsRow = document.getElementById("chat-controls");
+
+if (chatTab) {
+  // Create past-chats-toggle div
+  const pastChatsToggle = document.createElement("div");
+  pastChatsToggle.id = "past-chats-toggle";
+  pastChatsToggle.innerHTML = leftArrowSVG; // Set initial icon to left arrow
+  pastChatsToggle.classList.add("past-chats-open"); // Set initial position
+
+  // Create chat-controls-toggle div
+  const chatControlsToggle = document.createElement("div");
+  chatControlsToggle.id = "chat-controls-toggle";
+  chatControlsToggle.innerHTML = rightArrowSVG; // Set initial icon to right arrow
+  chatControlsToggle.classList.add("chat-controls-open"); // Set initial position
+
+  // Append both elements to the chat-tab
+  chatTab.appendChild(pastChatsToggle);
+  chatTab.appendChild(chatControlsToggle);
+}
+
+// Create navigation toggle div
+const navigationToggle = document.createElement("div");
+navigationToggle.id = "navigation-toggle";
+navigationToggle.innerHTML = leftArrowSVG; // Set initial icon to left arrow
+navigationToggle.classList.add("navigation-left"); // Set initial position
+headerBar.appendChild(navigationToggle);
+
+// Retrieve the dynamically created toggle buttons
+const pastChatsToggle = document.getElementById("past-chats-toggle");
+const chatControlsToggle = document.getElementById("chat-controls-toggle");
+
+function handleIndividualSidebarClose(event) {
+  const target = event.target;
+
+  // Close navigation bar if click is outside and it is open
+  if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) {
+    toggleSidebar(headerBar, navigationToggle, true);
+  }
+
+  // Close past chats row if click is outside and it is open
+  if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) {
+    toggleSidebar(pastChatsRow, pastChatsToggle, true);
+  }
+
+  // Close chat controls row if click is outside and it is open
+  if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) {
+    toggleSidebar(chatControlsRow, chatControlsToggle, true);
+  }
+}
+
+function toggleSidebar(sidebar, toggle, forceClose = false) {
+  const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden");
+  const shouldClose = !isCurrentlyHidden;
+
+  // Apply visibility classes
+  sidebar.classList.toggle("sidebar-hidden", shouldClose);
+  sidebar.classList.toggle("sidebar-shown", !shouldClose);
+
+  if (sidebar === headerBar) {
+    // Special handling for header bar
+    document.documentElement.style.setProperty("--header-width", shouldClose ? "0px" : "112px");
+    pastChatsRow.classList.toggle("negative-header", shouldClose);
+    pastChatsToggle.classList.toggle("negative-header", shouldClose);
+    toggle.innerHTML = shouldClose ? hamburgerMenuSVG : closeMenuSVG;
+  } else if (sidebar === pastChatsRow) {
+    // Past chats sidebar
+    toggle.classList.toggle("past-chats-closed", shouldClose);
+    toggle.classList.toggle("past-chats-open", !shouldClose);
+    toggle.innerHTML = shouldClose ? rightArrowSVG : leftArrowSVG;
+  } else if (sidebar === chatControlsRow) {
+    // Chat controls sidebar
+    toggle.classList.toggle("chat-controls-closed", shouldClose);
+    toggle.classList.toggle("chat-controls-open", !shouldClose);
+    toggle.innerHTML = shouldClose ? leftArrowSVG : rightArrowSVG;
+  }
+
+  // Mobile handling
+  if (isMobile()) {
+    sidebar.classList.toggle("sidebar-shown", !shouldClose);
+  }
+}
+
+// Function to check if the device is mobile
+function isMobile() {
+  return window.innerWidth <= 924;
+}
+
+// Function to initialize sidebars
+function initializeSidebars() {
+  const isOnMobile = isMobile();
+
+  if (isOnMobile) {
+    // Mobile state: Hide sidebars and set closed states
+    [pastChatsRow, chatControlsRow, headerBar].forEach(el => {
+      el.classList.add("sidebar-hidden");
+      el.classList.remove("sidebar-shown");
+    });
+
+    document.documentElement.style.setProperty("--header-width", "0px");
+    pastChatsRow.classList.add("negative-header");
+    pastChatsToggle.classList.add("negative-header", "past-chats-closed");
+    pastChatsToggle.classList.remove("past-chats-open");
+
+    [chatControlsToggle, navigationToggle].forEach(el => {
+      el.classList.add("chat-controls-closed");
+      el.classList.remove("chat-controls-open");
+    });
+
+    pastChatsToggle.innerHTML = rightArrowSVG;
+    chatControlsToggle.innerHTML = leftArrowSVG;
+    navigationToggle.innerHTML = hamburgerMenuSVG;
+  } else {
+    // Desktop state: Show sidebars and set open states
+    [pastChatsRow, chatControlsRow].forEach(el => {
+      el.classList.remove("sidebar-hidden", "sidebar-shown");
+    });
+
+    pastChatsToggle.classList.add("past-chats-open");
+    pastChatsToggle.classList.remove("past-chats-closed");
+
+    [chatControlsToggle, navigationToggle].forEach(el => {
+      el.classList.add("chat-controls-open");
+      el.classList.remove("chat-controls-closed");
+    });
+
+    pastChatsToggle.innerHTML = leftArrowSVG;
+    chatControlsToggle.innerHTML = rightArrowSVG;
+    navigationToggle.innerHTML = closeMenuSVG;
+  }
+}
+
+// Run the initializer when the page loads
+initializeSidebars();
+
+// Add click event listeners to toggle buttons
+pastChatsToggle.addEventListener("click", () => {
+  toggleSidebar(pastChatsRow, pastChatsToggle);
+});
+
+chatControlsToggle.addEventListener("click", () => {
+  toggleSidebar(chatControlsRow, chatControlsToggle);
+});
+
+navigationToggle.addEventListener("click", () => {
+  toggleSidebar(headerBar, navigationToggle);
+});
+
+//------------------------------------------------
+// Fixes #chat-input textarea height issue
+// for devices with width <= 924px
+//------------------------------------------------
+
+if (isMobile()) {
+  // Target the textarea
+  const textarea = document.querySelector("#chat-input textarea");
+
+  if (textarea) {
+    // Simulate adding and removing a newline
+    textarea.value += "\n";
+    textarea.dispatchEvent(new Event("input", { bubbles: true }));
+    textarea.value = textarea.value.slice(0, -1);
+    textarea.dispatchEvent(new Event("input", { bubbles: true }));
+  }
+}
+
+//------------------------------------------------
+// Create a top navigation bar on mobile
+//------------------------------------------------
+
+function createMobileTopBar() {
+  const chatTab = document.getElementById("chat-tab");
+
+  // Only create the top bar if it doesn't already exist
+  if (chatTab && !chatTab.querySelector(".mobile-top-bar")) {
+    const topBar = document.createElement("div");
+    topBar.classList.add("mobile-top-bar");
+
+    // Insert the top bar as the first child of chat-tab
+    chatTab.appendChild(topBar);
+  }
+}
+
+createMobileTopBar();
diff --git a/js/morphdom/morphdom-umd.min.js b/js/morphdom/morphdom-umd.min.js
new file mode 100644
index 00000000..6746f0e8
--- /dev/null
+++ b/js/morphdom/morphdom-umd.min.js
@@ -0,0 +1 @@
+(function(global,factory){typeof exports==="object"&&typeof module!=="undefined"?module.exports=factory():typeof define==="function"&&define.amd?define(factory):(global=global||self,global.morphdom=factory())})(this,function(){"use strict";var DOCUMENT_FRAGMENT_NODE=11;function morphAttrs(fromNode,toNode){var toNodeAttrs=toNode.attributes;var attr;var attrName;var attrNamespaceURI;var attrValue;var fromValue;if(toNode.nodeType===DOCUMENT_FRAGMENT_NODE||fromNode.nodeType===DOCUMENT_FRAGMENT_NODE){return}for(var i=toNodeAttrs.length-1;i>=0;i--){attr=toNodeAttrs[i];attrName=attr.name;attrNamespaceURI=attr.namespaceURI;attrValue=attr.value;if(attrNamespaceURI){attrName=attr.localName||attrName;fromValue=fromNode.getAttributeNS(attrNamespaceURI,attrName);if(fromValue!==attrValue){if(attr.prefix==="xmlns"){attrName=attr.name}fromNode.setAttributeNS(attrNamespaceURI,attrName,attrValue)}}else{fromValue=fromNode.getAttribute(attrName);if(fromValue!==attrValue){fromNode.setAttribute(attrName,attrValue)}}}var fromNodeAttrs=fromNode.attributes;for(var d=fromNodeAttrs.length-1;d>=0;d--){attr=fromNodeAttrs[d];attrName=attr.name;attrNamespaceURI=attr.namespaceURI;if(attrNamespaceURI){attrName=attr.localName||attrName;if(!toNode.hasAttributeNS(attrNamespaceURI,attrName)){fromNode.removeAttributeNS(attrNamespaceURI,attrName)}}else{if(!toNode.hasAttribute(attrName)){fromNode.removeAttribute(attrName)}}}}var range;var NS_XHTML="http://www.w3.org/1999/xhtml";var doc=typeof document==="undefined"?undefined:document;var HAS_TEMPLATE_SUPPORT=!!doc&&"content"in doc.createElement("template");var HAS_RANGE_SUPPORT=!!doc&&doc.createRange&&"createContextualFragment"in doc.createRange();function createFragmentFromTemplate(str){var template=doc.createElement("template");template.innerHTML=str;return template.content.childNodes[0]}function createFragmentFromRange(str){if(!range){range=doc.createRange();range.selectNode(doc.body)}var fragment=range.createContextualFragment(str);return fragment.childNodes[0]}function createFragmentFromWrap(str){var fragment=doc.createElement("body");fragment.innerHTML=str;return fragment.childNodes[0]}function toElement(str){str=str.trim();if(HAS_TEMPLATE_SUPPORT){return createFragmentFromTemplate(str)}else if(HAS_RANGE_SUPPORT){return createFragmentFromRange(str)}return createFragmentFromWrap(str)}function compareNodeNames(fromEl,toEl){var fromNodeName=fromEl.nodeName;var toNodeName=toEl.nodeName;var fromCodeStart,toCodeStart;if(fromNodeName===toNodeName){return true}fromCodeStart=fromNodeName.charCodeAt(0);toCodeStart=toNodeName.charCodeAt(0);if(fromCodeStart<=90&&toCodeStart>=97){return fromNodeName===toNodeName.toUpperCase()}else if(toCodeStart<=90&&fromCodeStart>=97){return toNodeName===fromNodeName.toUpperCase()}else{return false}}function createElementNS(name,namespaceURI){return!namespaceURI||namespaceURI===NS_XHTML?doc.createElement(name):doc.createElementNS(namespaceURI,name)}function moveChildren(fromEl,toEl){var curChild=fromEl.firstChild;while(curChild){var nextChild=curChild.nextSibling;toEl.appendChild(curChild);curChild=nextChild}return toEl}function syncBooleanAttrProp(fromEl,toEl,name){if(fromEl[name]!==toEl[name]){fromEl[name]=toEl[name];if(fromEl[name]){fromEl.setAttribute(name,"")}else{fromEl.removeAttribute(name)}}}var specialElHandlers={OPTION:function(fromEl,toEl){var parentNode=fromEl.parentNode;if(parentNode){var 
parentName=parentNode.nodeName.toUpperCase();if(parentName==="OPTGROUP"){parentNode=parentNode.parentNode;parentName=parentNode&&parentNode.nodeName.toUpperCase()}if(parentName==="SELECT"&&!parentNode.hasAttribute("multiple")){if(fromEl.hasAttribute("selected")&&!toEl.selected){fromEl.setAttribute("selected","selected");fromEl.removeAttribute("selected")}parentNode.selectedIndex=-1}}syncBooleanAttrProp(fromEl,toEl,"selected")},INPUT:function(fromEl,toEl){syncBooleanAttrProp(fromEl,toEl,"checked");syncBooleanAttrProp(fromEl,toEl,"disabled");if(fromEl.value!==toEl.value){fromEl.value=toEl.value}if(!toEl.hasAttribute("value")){fromEl.removeAttribute("value")}},TEXTAREA:function(fromEl,toEl){var newValue=toEl.value;if(fromEl.value!==newValue){fromEl.value=newValue}var firstChild=fromEl.firstChild;if(firstChild){var oldValue=firstChild.nodeValue;if(oldValue==newValue||!newValue&&oldValue==fromEl.placeholder){return}firstChild.nodeValue=newValue}},SELECT:function(fromEl,toEl){if(!toEl.hasAttribute("multiple")){var selectedIndex=-1;var i=0;var curChild=fromEl.firstChild;var optgroup;var nodeName;while(curChild){nodeName=curChild.nodeName&&curChild.nodeName.toUpperCase();if(nodeName==="OPTGROUP"){optgroup=curChild;curChild=optgroup.firstChild}else{if(nodeName==="OPTION"){if(curChild.hasAttribute("selected")){selectedIndex=i;break}i++}curChild=curChild.nextSibling;if(!curChild&&optgroup){curChild=optgroup.nextSibling;optgroup=null}}}fromEl.selectedIndex=selectedIndex}}};var ELEMENT_NODE=1;var DOCUMENT_FRAGMENT_NODE$1=11;var TEXT_NODE=3;var COMMENT_NODE=8;function noop(){}function defaultGetNodeKey(node){if(node){return node.getAttribute&&node.getAttribute("id")||node.id}}function morphdomFactory(morphAttrs){return function morphdom(fromNode,toNode,options){if(!options){options={}}if(typeof toNode==="string"){if(fromNode.nodeName==="#document"||fromNode.nodeName==="HTML"||fromNode.nodeName==="BODY"){var toNodeHtml=toNode;toNode=doc.createElement("html");toNode.innerHTML=toNodeHtml}else{toNode=toElement(toNode)}}else if(toNode.nodeType===DOCUMENT_FRAGMENT_NODE$1){toNode=toNode.firstElementChild}var getNodeKey=options.getNodeKey||defaultGetNodeKey;var onBeforeNodeAdded=options.onBeforeNodeAdded||noop;var onNodeAdded=options.onNodeAdded||noop;var onBeforeElUpdated=options.onBeforeElUpdated||noop;var onElUpdated=options.onElUpdated||noop;var onBeforeNodeDiscarded=options.onBeforeNodeDiscarded||noop;var onNodeDiscarded=options.onNodeDiscarded||noop;var onBeforeElChildrenUpdated=options.onBeforeElChildrenUpdated||noop;var skipFromChildren=options.skipFromChildren||noop;var addChild=options.addChild||function(parent,child){return parent.appendChild(child)};var childrenOnly=options.childrenOnly===true;var fromNodesLookup=Object.create(null);var keyedRemovalList=[];function addKeyedRemoval(key){keyedRemovalList.push(key)}function walkDiscardedChildNodes(node,skipKeyedNodes){if(node.nodeType===ELEMENT_NODE){var curChild=node.firstChild;while(curChild){var key=undefined;if(skipKeyedNodes&&(key=getNodeKey(curChild))){addKeyedRemoval(key)}else{onNodeDiscarded(curChild);if(curChild.firstChild){walkDiscardedChildNodes(curChild,skipKeyedNodes)}}curChild=curChild.nextSibling}}}function removeNode(node,parentNode,skipKeyedNodes){if(onBeforeNodeDiscarded(node)===false){return}if(parentNode){parentNode.removeChild(node)}onNodeDiscarded(node);walkDiscardedChildNodes(node,skipKeyedNodes)}function indexTree(node){if(node.nodeType===ELEMENT_NODE||node.nodeType===DOCUMENT_FRAGMENT_NODE$1){var 
curChild=node.firstChild;while(curChild){var key=getNodeKey(curChild);if(key){fromNodesLookup[key]=curChild}indexTree(curChild);curChild=curChild.nextSibling}}}indexTree(fromNode);function handleNodeAdded(el){onNodeAdded(el);var curChild=el.firstChild;while(curChild){var nextSibling=curChild.nextSibling;var key=getNodeKey(curChild);if(key){var unmatchedFromEl=fromNodesLookup[key];if(unmatchedFromEl&&compareNodeNames(curChild,unmatchedFromEl)){curChild.parentNode.replaceChild(unmatchedFromEl,curChild);morphEl(unmatchedFromEl,curChild)}else{handleNodeAdded(curChild)}}else{handleNodeAdded(curChild)}curChild=nextSibling}}function cleanupFromEl(fromEl,curFromNodeChild,curFromNodeKey){while(curFromNodeChild){var fromNextSibling=curFromNodeChild.nextSibling;if(curFromNodeKey=getNodeKey(curFromNodeChild)){addKeyedRemoval(curFromNodeKey)}else{removeNode(curFromNodeChild,fromEl,true)}curFromNodeChild=fromNextSibling}}function morphEl(fromEl,toEl,childrenOnly){var toElKey=getNodeKey(toEl);if(toElKey){delete fromNodesLookup[toElKey]}if(!childrenOnly){var beforeUpdateResult=onBeforeElUpdated(fromEl,toEl);if(beforeUpdateResult===false){return}else if(beforeUpdateResult instanceof HTMLElement){fromEl=beforeUpdateResult;indexTree(fromEl)}morphAttrs(fromEl,toEl);onElUpdated(fromEl);if(onBeforeElChildrenUpdated(fromEl,toEl)===false){return}}if(fromEl.nodeName!=="TEXTAREA"){morphChildren(fromEl,toEl)}else{specialElHandlers.TEXTAREA(fromEl,toEl)}}function morphChildren(fromEl,toEl){var skipFrom=skipFromChildren(fromEl,toEl);var curToNodeChild=toEl.firstChild;var curFromNodeChild=fromEl.firstChild;var curToNodeKey;var curFromNodeKey;var fromNextSibling;var toNextSibling;var matchingFromEl;outer:while(curToNodeChild){toNextSibling=curToNodeChild.nextSibling;curToNodeKey=getNodeKey(curToNodeChild);while(!skipFrom&&curFromNodeChild){fromNextSibling=curFromNodeChild.nextSibling;if(curToNodeChild.isSameNode&&curToNodeChild.isSameNode(curFromNodeChild)){curToNodeChild=toNextSibling;curFromNodeChild=fromNextSibling;continue outer}curFromNodeKey=getNodeKey(curFromNodeChild);var curFromNodeType=curFromNodeChild.nodeType;var isCompatible=undefined;if(curFromNodeType===curToNodeChild.nodeType){if(curFromNodeType===ELEMENT_NODE){if(curToNodeKey){if(curToNodeKey!==curFromNodeKey){if(matchingFromEl=fromNodesLookup[curToNodeKey]){if(fromNextSibling===matchingFromEl){isCompatible=false}else{fromEl.insertBefore(matchingFromEl,curFromNodeChild);if(curFromNodeKey){addKeyedRemoval(curFromNodeKey)}else{removeNode(curFromNodeChild,fromEl,true)}curFromNodeChild=matchingFromEl;curFromNodeKey=getNodeKey(curFromNodeChild)}}else{isCompatible=false}}}else if(curFromNodeKey){isCompatible=false}isCompatible=isCompatible!==false&&compareNodeNames(curFromNodeChild,curToNodeChild);if(isCompatible){morphEl(curFromNodeChild,curToNodeChild)}}else if(curFromNodeType===TEXT_NODE||curFromNodeType==COMMENT_NODE){isCompatible=true;if(curFromNodeChild.nodeValue!==curToNodeChild.nodeValue){curFromNodeChild.nodeValue=curToNodeChild.nodeValue}}}if(isCompatible){curToNodeChild=toNextSibling;curFromNodeChild=fromNextSibling;continue outer}if(curFromNodeKey){addKeyedRemoval(curFromNodeKey)}else{removeNode(curFromNodeChild,fromEl,true)}curFromNodeChild=fromNextSibling}if(curToNodeKey&&(matchingFromEl=fromNodesLookup[curToNodeKey])&&compareNodeNames(matchingFromEl,curToNodeChild)){if(!skipFrom){addChild(fromEl,matchingFromEl)}morphEl(matchingFromEl,curToNodeChild)}else{var 
onBeforeNodeAddedResult=onBeforeNodeAdded(curToNodeChild);if(onBeforeNodeAddedResult!==false){if(onBeforeNodeAddedResult){curToNodeChild=onBeforeNodeAddedResult}if(curToNodeChild.actualize){curToNodeChild=curToNodeChild.actualize(fromEl.ownerDocument||doc)}addChild(fromEl,curToNodeChild);handleNodeAdded(curToNodeChild)}}curToNodeChild=toNextSibling;curFromNodeChild=fromNextSibling}cleanupFromEl(fromEl,curFromNodeChild,curFromNodeKey);var specialElHandler=specialElHandlers[fromEl.nodeName];if(specialElHandler){specialElHandler(fromEl,toEl)}}var morphedNode=fromNode;var morphedNodeType=morphedNode.nodeType;var toNodeType=toNode.nodeType;if(!childrenOnly){if(morphedNodeType===ELEMENT_NODE){if(toNodeType===ELEMENT_NODE){if(!compareNodeNames(fromNode,toNode)){onNodeDiscarded(fromNode);morphedNode=moveChildren(fromNode,createElementNS(toNode.nodeName,toNode.namespaceURI))}}else{morphedNode=toNode}}else if(morphedNodeType===TEXT_NODE||morphedNodeType===COMMENT_NODE){if(toNodeType===morphedNodeType){if(morphedNode.nodeValue!==toNode.nodeValue){morphedNode.nodeValue=toNode.nodeValue}return morphedNode}else{morphedNode=toNode}}}if(morphedNode===toNode){onNodeDiscarded(fromNode)}else{if(toNode.isSameNode&&toNode.isSameNode(morphedNode)){return}morphEl(morphedNode,toNode,childrenOnly);if(keyedRemovalList){for(var i=0,len=keyedRemovalList.length;i div > :nth-child(n+2), #extensions");
+const belowChatInput = document.querySelectorAll(
+  "#chat-tab > div > :nth-child(1), #chat-tab > div > :nth-child(3), #chat-tab > div > :nth-child(4), #extensions"
+);
 const chatParent = document.querySelector(".chat-parent");
 
 function toggle_controls(value) {
diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py
deleted file mode 100644
index 69e8f299..00000000
--- a/modules/AutoGPTQ_loader.py
+++ /dev/null
@@ -1,74 +0,0 @@
-from pathlib import Path
-
-from accelerate.utils import is_xpu_available
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-
-import modules.shared as shared
-from modules.logging_colors import logger
-from modules.models import get_max_memory_dict
-
-
-def load_quantized(model_name):
-    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
-    pt_path = None
-
-    # Find the model checkpoint
-    if shared.args.checkpoint:
-        pt_path = Path(shared.args.checkpoint)
-    else:
-        for ext in ['.safetensors', '.pt', '.bin']:
-            found = list(path_to_model.glob(f"*{ext}"))
-            if len(found) > 0:
-                if len(found) > 1:
-                    logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
-
-                pt_path = found[-1]
-                break
-
-    if pt_path is None:
-        logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")
-        return
-
-    use_safetensors = pt_path.suffix == '.safetensors'
-    if not (path_to_model / "quantize_config.json").exists():
-        quantize_config = BaseQuantizeConfig(
-            bits=bits if (bits := shared.args.wbits) > 0 else 4,
-            group_size=gs if (gs := shared.args.groupsize) > 0 else -1,
-            desc_act=shared.args.desc_act
-        )
-    else:
-        quantize_config = None
-
-    # Define the params for AutoGPTQForCausalLM.from_quantized
-    params = {
-        'model_basename': pt_path.stem,
-        'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu",
-        'use_triton': shared.args.triton,
-        'inject_fused_attention': False,
-        'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
-        'use_safetensors': use_safetensors,
-        'trust_remote_code': shared.args.trust_remote_code,
-        'max_memory': get_max_memory_dict(),
-        'quantize_config': quantize_config,
-        'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
-        'disable_exllama': shared.args.disable_exllama,
-        'disable_exllamav2': shared.args.disable_exllamav2,
-    }
-
-    logger.info(f"The AutoGPTQ params are: {params}")
-    model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)
-
-    # These lines fix the multimodal extension when used with AutoGPTQ
-    if hasattr(model, 'model'):
-        if not hasattr(model, 'dtype'):
-            if hasattr(model.model, 'dtype'):
-                model.dtype = model.model.dtype
-
-        if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'):
-            if not hasattr(model, 'embed_tokens'):
-                model.embed_tokens = model.model.model.embed_tokens
-
-            if not hasattr(model.model, 'embed_tokens'):
-                model.model.embed_tokens = model.model.model.embed_tokens
-
-    return model
diff --git a/modules/LoRA.py b/modules/LoRA.py
index 117022cf..1f4883e2 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -1,18 +1,12 @@
 from pathlib import Path
 
-import torch
-from peft import PeftModel
-from transformers import is_torch_xpu_available
-
 import modules.shared as shared
 from modules.logging_colors import logger
-from modules.models import reload_model
+from modules.models import get_device
 
 
 def add_lora_to_model(lora_names):
-    if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
-        add_lora_autogptq(lora_names)
-    elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
+    if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
         add_lora_exllamav2(lora_names)
     else:
         add_lora_transformers(lora_names)
@@ -52,39 +46,10 @@ def add_lora_exllamav2(lora_names):
         shared.model.loras = None
 
 
-def add_lora_autogptq(lora_names):
-    '''
-    Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing
-    '''
-
-    try:
-        from auto_gptq import get_gptq_peft_model
-        from auto_gptq.utils.peft_utils import GPTQLoraConfig
-    except:
-        logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
-        return
-
-    if len(lora_names) == 0:
-        reload_model()
-
-        shared.lora_names = []
-        return
-    else:
-        if len(lora_names) > 1:
-            logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
-
-        peft_config = GPTQLoraConfig(
-            inference_mode=True,
-        )
-
-        lora_path = get_lora_path(lora_names[0])
-        logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
-        shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
-        shared.lora_names = [lora_names[0]]
-        return
-
-
 def add_lora_transformers(lora_names):
+
+    from peft import PeftModel
+
     prior_set = set(shared.lora_names)
     added_set = set(lora_names) - prior_set
     removed_set = prior_set - set(lora_names)
@@ -130,14 +95,9 @@ def add_lora_transformers(lora_names):
         if not shared.args.load_in_8bit and not shared.args.cpu:
             shared.model.half()
             if not hasattr(shared.model, "hf_device_map"):
-                if torch.backends.mps.is_available():
-                    device = torch.device('mps')
+                device = get_device()
+                if device:
                     shared.model = shared.model.to(device)
-                elif is_torch_xpu_available():
-                    device = torch.device("xpu:0")
-                    shared.model = shared.model.to(device)
-                else:
-                    shared.model = shared.model.cuda()
 
     shared.lora_names = lora_names
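
The refactored branch above now calls a get_device() helper imported from modules.models instead of branching on MPS/XPU/CUDA inline; that helper itself is not part of this hunk. A minimal sketch of what it might look like, mirroring the device checks removed above (the actual implementation in modules/models.py may differ in ordering and in the devices it covers):

    # Hypothetical sketch only; mirrors the device logic deleted from add_lora_transformers above.
    import torch
    from transformers import is_torch_xpu_available

    def get_device():
        # Return a torch.device to move the model to, or None to leave it where it is (e.g. CPU).
        if torch.cuda.is_available():
            return torch.device('cuda')
        elif torch.backends.mps.is_available():
            return torch.device('mps')
        elif is_torch_xpu_available():
            return torch.device('xpu:0')
        return None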
 
diff --git a/modules/block_requests.py b/modules/block_requests.py
index 886930f0..5a4b533f 100644
--- a/modules/block_requests.py
+++ b/modules/block_requests.py
@@ -3,7 +3,7 @@ import io
 
 import requests
 
-from modules import shared
+from modules import shared, ui
 from modules.logging_colors import logger
 
 original_open = open
@@ -40,14 +40,14 @@ def my_get(url, **kwargs):
 # Kindly provided by our friend WizardLM-30B
 def my_open(*args, **kwargs):
     filename = str(args[0])
-    if filename.endswith('index.html'):
+    if filename.endswith(('index.html', 'share.html')):
         with original_open(*args, **kwargs) as f:
             file_contents = f.read()
 
         if len(args) > 1 and args[1] == 'rb':
             file_contents = file_contents.decode('utf-8')
 
-        file_contents = file_contents.replace('\t\t', '')
+        file_contents = file_contents.replace('\t\t', '')
         file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1')
         file_contents = file_contents.replace(
             '',
@@ -55,8 +55,10 @@ def my_open(*args, **kwargs):
             '\n    '
             '\n    '
             '\n    '
+            '\n    '
             f'\n    '
             '\n    '
+            f'\n    '
             '\n  '
         )
 
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 2b039ef1..0f918f3d 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -1,11 +1,9 @@
-import gc
 import traceback
 from queue import Queue
 from threading import Thread
 
 import torch
 import transformers
-from transformers import is_torch_npu_available, is_torch_xpu_available
 
 import modules.shared as shared
 
@@ -65,7 +63,6 @@ class Iteratorize:
                 traceback.print_exc()
                 pass
 
-            clear_torch_cache()
             self.q.put(self.sentinel)
             if self.c_callback:
                 self.c_callback(ret)
@@ -84,22 +81,10 @@ class Iteratorize:
             return obj
 
     def __del__(self):
-        clear_torch_cache()
+        pass
 
     def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.stop_now = True
-        clear_torch_cache()
-
-
-def clear_torch_cache():
-    gc.collect()
-    if not shared.args.cpu:
-        if is_torch_xpu_available():
-            torch.xpu.empty_cache()
-        elif is_torch_npu_available():
-            torch.npu.empty_cache()
-        else:
-            torch.cuda.empty_cache()
diff --git a/modules/chat.py b/modules/chat.py
index 00c4ffa9..694c137b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -352,13 +352,17 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True, for_ui=for_ui)):
 
         # Extract the reply
-        visible_reply = reply
         if state['mode'] in ['chat', 'chat-instruct']:
-            visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
+            visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply + '▍')
+        else:
+            visible_reply = reply + '▍'
 
         visible_reply = html.escape(visible_reply)
 
         if shared.stop_everything:
+            if output['visible'][-1][1].endswith('▍'):
+                output['visible'][-1][1] = output['visible'][-1][1][:-1]
+
             output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
             yield output
             return
@@ -374,6 +378,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             if is_stream:
                 yield output
 
+    if output['visible'][-1][1].endswith('▍'):
+        output['visible'][-1][1] = output['visible'][-1][1][:-1]
+
     output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
     yield output
 
@@ -586,29 +593,34 @@ def find_all_histories_with_first_prompts(state):
     result = []
     for i, path in enumerate(histories):
         filename = path.stem
-        if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename):
-            with open(path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
+        file_content = ""
+        with open(path, 'r', encoding='utf-8') as f:
+            file_content = f.read()
 
-                first_prompt = ""
-                if data and 'visible' in data and len(data['visible']) > 0:
-                    if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
-                        if len(data['visible']) > 1:
-                            first_prompt = html.unescape(data['visible'][1][0])
-                        elif i == 0:
-                            first_prompt = "New chat"
-                    else:
-                        first_prompt = html.unescape(data['visible'][0][0])
-                elif i == 0:
-                    first_prompt = "New chat"
+        if state['search_chat'] and state['search_chat'] not in file_content:
+            continue
+
+        data = json.loads(file_content)
+        if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename):
+            first_prompt = ""
+            if data and 'visible' in data and len(data['visible']) > 0:
+                if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
+                    if len(data['visible']) > 1:
+                        first_prompt = html.unescape(data['visible'][1][0])
+                    elif i == 0:
+                        first_prompt = "New chat"
+                else:
+                    first_prompt = html.unescape(data['visible'][0][0])
+            elif i == 0:
+                first_prompt = "New chat"
         else:
             first_prompt = filename
 
         first_prompt = first_prompt.strip()
 
-        # Truncate the first prompt if it's longer than 32 characters
-        if len(first_prompt) > 32:
-            first_prompt = first_prompt[:29] + '...'
+        # Truncate the first prompt if it's longer than 30 characters
+        if len(first_prompt) > 30:
+            first_prompt = first_prompt[:30 - 3] + '...'
 
         result.append((first_prompt, filename))
 
@@ -1059,7 +1071,12 @@ def handle_start_new_chat_click(state):
 
     convert_to_markdown.cache_clear()
 
-    return [history, html, gr.update(choices=histories, value=histories[0][1])]
+    if len(histories) > 0:
+        past_chats_update = gr.update(choices=histories, value=histories[0][1])
+    else:
+        past_chats_update = gr.update(choices=histories)
+
+    return [history, html, past_chats_update]
 
 
 def handle_delete_chat_confirm_click(state):
@@ -1080,11 +1097,25 @@ def handle_delete_chat_confirm_click(state):
     ]
 
 
+def handle_branch_chat_click(state):
+    history = state['history']
+    new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
+    save_history(history, new_unique_id, state['character_menu'], state['mode'])
+
+    histories = find_all_histories_with_first_prompts(state)
+    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
+
+    convert_to_markdown.cache_clear()
+
+    past_chats_update = gr.update(choices=histories, value=new_unique_id)
+
+    return [history, html, past_chats_update]
+
+
 def handle_rename_chat_click():
     return [
-        gr.update(visible=True, value="My New Chat"),
+        gr.update(value="My New Chat"),
         gr.update(visible=True),
-        gr.update(visible=True)
     ]
 
 
@@ -1095,25 +1126,33 @@ def handle_rename_chat_confirm(rename_to, state):
     return [
         gr.update(choices=histories, value=rename_to),
         gr.update(visible=False),
-        gr.update(visible=False),
-        gr.update(visible=False)
     ]
 
 
+def handle_search_chat_change(state):
+    histories = find_all_histories_with_first_prompts(state)
+    return gr.update(choices=histories)
+
+
 def handle_upload_chat_history(load_chat_history, state):
     history = start_new_chat(state)
     history = load_history_json(load_chat_history, history)
-    histories = find_all_histories_with_first_prompts(state)
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
+    histories = find_all_histories_with_first_prompts(state)
 
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
     convert_to_markdown.cache_clear()
 
+    if len(histories) > 0:
+        past_chats_update = gr.update(choices=histories, value=histories[0][1])
+    else:
+        past_chats_update = gr.update(choices=histories)
+
     return [
         history,
         html,
-        gr.update(choices=histories, value=histories[0][1])
+        past_chats_update
     ]
 
 
@@ -1132,6 +1171,11 @@ def handle_character_menu_change(state):
 
     convert_to_markdown.cache_clear()
 
+    if len(histories) > 0:
+        past_chats_update = gr.update(choices=histories, value=histories[0][1])
+    else:
+        past_chats_update = gr.update(choices=histories)
+
     return [
         history,
         html,
@@ -1140,7 +1184,7 @@ def handle_character_menu_change(state):
         picture,
         greeting,
         context,
-        gr.update(choices=histories, value=histories[0][1]),
+        past_chats_update,
     ]
 
 
@@ -1151,12 +1195,17 @@ def handle_mode_change(state):
 
     convert_to_markdown.cache_clear()
 
+    if len(histories) > 0:
+        past_chats_update = gr.update(choices=histories, value=histories[0][1])
+    else:
+        past_chats_update = gr.update(choices=histories)
+
     return [
         history,
         html,
         gr.update(visible=state['mode'] != 'instruct'),
         gr.update(visible=state['mode'] == 'chat-instruct'),
-        gr.update(choices=histories, value=histories[0][1])
+        past_chats_update
     ]
 
 
@@ -1189,7 +1238,7 @@ def handle_delete_template_click(template):
     return [
         f"{template}.yaml",
         "instruction-templates/",
-        gr.update(visible=True)
+        gr.update(visible=False)
     ]
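
As a design note, the handlers above repeat the same guard for the past-chats dropdown: select the newest history when any exist, otherwise only refresh the choices. A small helper could express that once; the sketch below is illustrative only and not part of the patch (gr is gradio, and histories is the list returned by find_all_histories_with_first_prompts()):

    # Hypothetical helper, not present in the patch.
    def past_chats_update(histories):
        if len(histories) > 0:
            return gr.update(choices=histories, value=histories[0][1])
        return gr.update(choices=histories)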
 
 
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index a770e342..0289bb21 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -1,3 +1,4 @@
+import json
 import traceback
 from pathlib import Path
 
@@ -7,6 +8,9 @@ from exllamav2 import (
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q6,
+    ExLlamaV2Cache_Q8,
+    ExLlamaV2Cache_TP,
     ExLlamaV2Config,
     ExLlamaV2Tokenizer
 )
@@ -18,14 +22,6 @@ from modules.text_generation import get_max_prompt_length
 
 try:
     import flash_attn
-except ModuleNotFoundError:
-    logger.warning(
-        'You are running ExLlamaV2 without flash-attention. This will cause the VRAM usage '
-        'to be a lot higher than it could be.\n'
-        'Try installing flash-attention following the instructions here: '
-        'https://github.com/Dao-AILab/flash-attention#installation-and-features'
-    )
-    pass
 except Exception:
     logger.warning('Failed to load flash-attention due to the following error:\n')
     traceback.print_exc()
@@ -54,21 +50,38 @@ class Exllamav2Model:
 
         model = ExLlamaV2(config)
 
-        if not shared.args.autosplit:
-            split = None
-            if shared.args.gpu_split:
-                split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+        split = None
+        if shared.args.gpu_split:
+            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
 
+        if shared.args.enable_tp:
+            model.load_tp(split)
+        elif not shared.args.autosplit:
             model.load(split)
 
-        if shared.args.cache_8bit:
-            cache = ExLlamaV2Cache_8bit(model, lazy=shared.args.autosplit)
-        elif shared.args.cache_4bit:
-            cache = ExLlamaV2Cache_Q4(model, lazy=shared.args.autosplit)
-        else:
-            cache = ExLlamaV2Cache(model, lazy=shared.args.autosplit)
+        # Determine the correct cache type
+        kv_cache_type = shared.args.cache_type.lower()
 
-        if shared.args.autosplit:
+        if kv_cache_type == 'fp16':
+            cache_type = ExLlamaV2Cache
+        elif kv_cache_type == 'fp8':
+            cache_type = ExLlamaV2Cache_8bit
+        elif kv_cache_type == 'q8':
+            cache_type = ExLlamaV2Cache_Q8
+        elif kv_cache_type == 'q6':
+            cache_type = ExLlamaV2Cache_Q6
+        elif kv_cache_type == 'q4':
+            cache_type = ExLlamaV2Cache_Q4
+        else:
+            raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
+
+        # Use TP if specified
+        if shared.args.enable_tp:
+            cache = ExLlamaV2Cache_TP(model, base=cache_type)
+        else:
+            cache = cache_type(model, lazy=shared.args.autosplit)
+
+        if shared.args.autosplit and not shared.args.enable_tp:
             model.load_autosplit(cache)
 
         tokenizer = ExLlamaV2Tokenizer(config)
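The if/elif chain maps the new `--cache_type` string onto an ExLlamaV2 cache class. Purely as a sketch, the same selection can be written as a lookup table, which keeps the error message and the set of valid options in one place (illustrative refactor, assuming the same exllamav2 classes the diff imports):

```python
from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8
)

CACHE_TYPES = {
    'fp16': ExLlamaV2Cache,
    'fp8': ExLlamaV2Cache_8bit,
    'q8': ExLlamaV2Cache_Q8,
    'q6': ExLlamaV2Cache_Q6,
    'q4': ExLlamaV2Cache_Q4,
}


def resolve_cache_type(name: str):
    try:
        return CACHE_TYPES[name.lower()]
    except KeyError:
        raise ValueError(
            f"Invalid cache type for ExLlamaV2: {name}. "
            f"Valid options are: {', '.join(CACHE_TYPES)}."
        )
```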
@@ -110,6 +123,10 @@ class Exllamav2Model:
         settings.token_presence_penalty = state['presence_penalty']
 
         settings.temperature = state['temperature']
+        settings.smoothing_factor = state['smoothing_factor']
+        settings.min_temp = state['dynatemp_low'] if state['dynamic_temperature'] else 0
+        settings.max_temp = state['dynatemp_high'] if state['dynamic_temperature'] else 0
+        settings.temp_exponent = state['dynatemp_exponent']
         settings.top_k = state['top_k']
         settings.top_p = state['top_p']
         settings.top_a = state['top_a']
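Dynamic temperature is passed straight through to ExLlamaV2's sampler settings, with 0 used for `min_temp`/`max_temp` when the feature is off, which is the "disabled" value the code above relies on. A tiny restatement of that mapping (field names as used above; illustrative only):

```python
def dynatemp_fields(state):
    enabled = state['dynamic_temperature']
    return {
        'min_temp': state['dynatemp_low'] if enabled else 0,
        'max_temp': state['dynatemp_high'] if enabled else 0,
        'temp_exponent': state['dynatemp_exponent'],
    }
```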
@@ -131,6 +148,29 @@ class Exllamav2Model:
             if len(to_ban) > 0:
                 settings.disallow_tokens(self.tokenizer, to_ban)
 
+        settings.dry_allowed_length = state['dry_allowed_length']
+        settings.dry_base = state['dry_base']
+        settings.dry_multiplier = state['dry_multiplier']
+
+        # Dry sequence breakers processing
+        if state['dry_multiplier'] > 0 and state['dry_sequence_breakers']:
+            dry_sequence_breakers = state['dry_sequence_breakers']
+
+            # Support both JSON array notation and comma-separated strings.
+            if not dry_sequence_breakers.startswith("["):
+                dry_sequence_breakers = "[" + dry_sequence_breakers + "]"
+
+            sequence_breaker_strings = json.loads(dry_sequence_breakers)
+            # Prefix with 'a' to get the correct encoding of the token at the end of a text.
+            sequence_breakers = {
+                self.encode(f"a{s}")[0, -1].item() for s in sequence_breaker_strings
+            }
+
+            settings.dry_sequence_breakers = sequence_breakers
+
+        settings.xtc_probability = state['xtc_probability']
+        settings.xtc_threshold = state['xtc_threshold']
+
         ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
         ids = ids[:, -get_max_prompt_length(state):]
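Two details in the DRY block are worth spelling out: the breaker list may arrive either as a JSON array or as a bare comma-separated string, and each breaker is encoded with a leading `a` because most tokenizers encode a string differently at the start of a text than after other characters, so taking the last token id of `"a" + s` gives the mid-text encoding of `s`. A standalone sketch of the parsing half:

```python
import json


def parse_sequence_breakers(raw: str):
    # Accept either a JSON array ('["\\n", ":"]') or a comma-separated
    # list of quoted strings ('"\\n", ":"').
    if not raw.startswith("["):
        raw = "[" + raw + "]"

    return json.loads(raw)


# parse_sequence_breakers('"\\n", ":", "\\"", "*"') == ['\n', ':', '"', '*']
```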
 
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index 53143d9a..62d1e054 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -4,30 +4,25 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
 import torch
+from torch.nn import CrossEntropyLoss
+from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q6,
+    ExLlamaV2Cache_Q8,
+    ExLlamaV2Cache_TP,
     ExLlamaV2Config
 )
-from torch.nn import CrossEntropyLoss
-from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
 from modules import shared
 from modules.logging_colors import logger
 
 try:
     import flash_attn
-except ModuleNotFoundError:
-    logger.warning(
-        'You are running ExLlamaV2 without flash-attention. This will cause the VRAM usage '
-        'to be a lot higher than it could be.\n'
-        'Try installing flash-attention following the instructions here: '
-        'https://github.com/Dao-AILab/flash-attention#installation-and-features'
-    )
-    pass
 except Exception:
     logger.warning('Failed to load flash-attention due to the following error:\n')
     traceback.print_exc()
@@ -42,21 +37,38 @@ class Exllamav2HF(PreTrainedModel):
 
         self.ex_model = ExLlamaV2(config)
 
-        if not shared.args.autosplit:
-            split = None
-            if shared.args.gpu_split:
-                split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+        split = None
+        if shared.args.gpu_split:
+            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
 
+        if shared.args.enable_tp:
+            self.ex_model.load_tp(split)
+        elif not shared.args.autosplit:
             self.ex_model.load(split)
 
-        if shared.args.cache_8bit:
-            self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=shared.args.autosplit)
-        elif shared.args.cache_4bit:
-            self.ex_cache = ExLlamaV2Cache_Q4(self.ex_model, lazy=shared.args.autosplit)
-        else:
-            self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=shared.args.autosplit)
+        # Determine the correct cache type
+        kv_cache_type = shared.args.cache_type.lower()
 
-        if shared.args.autosplit:
+        if kv_cache_type == 'fp16':
+            cache_type = ExLlamaV2Cache
+        elif kv_cache_type == 'fp8':
+            cache_type = ExLlamaV2Cache_8bit
+        elif kv_cache_type == 'q8':
+            cache_type = ExLlamaV2Cache_Q8
+        elif kv_cache_type == 'q6':
+            cache_type = ExLlamaV2Cache_Q6
+        elif kv_cache_type == 'q4':
+            cache_type = ExLlamaV2Cache_Q4
+        else:
+            raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
+
+        # Use TP if specified
+        if shared.args.enable_tp:
+            self.ex_cache = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
+        else:
+            self.ex_cache = cache_type(self.ex_model, lazy=shared.args.autosplit)
+
+        if shared.args.autosplit and not shared.args.enable_tp:
             self.ex_model.load_autosplit(self.ex_cache)
 
         self.past_seq = None
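Both ExLlamaV2 loaders now gate three mutually exclusive load paths on the same two flags. As a quick reference for how `--enable_tp`, `--autosplit`, and `--gpu-split` interact in the code above (a descriptive sketch, not repo code):

```python
def choose_load_path(enable_tp: bool, autosplit: bool) -> str:
    if enable_tp:
        # Tensor parallelism: load_tp(split), cache wrapped in ExLlamaV2Cache_TP
        return "load_tp"
    if autosplit:
        # Lazy cache plus load_autosplit(cache): layers are placed automatically
        return "load_autosplit"

    # Manual --gpu-split allocation (or a single GPU): plain load(split)
    return "load"
```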
diff --git a/modules/html_generator.py b/modules/html_generator.py
index d0afd6b2..29973412 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -9,11 +9,35 @@ import markdown
 from PIL import Image, ImageOps
 
 from modules import shared
+from modules.sane_markdown_lists import SaneListExtension
 from modules.utils import get_available_chat_styles
 
 # This is to store the paths to the thumbnails of the profile pictures
 image_cache = {}
 
+
+def minify_css(css: str) -> str:
+    # Step 1: Remove comments
+    css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)
+
+    # Step 2: Remove leading and trailing whitespace
+    css = re.sub(r'^[ \t]*|[ \t]*$', '', css, flags=re.MULTILINE)
+
+    # Step 3: Remove spaces after specific characters ({ : ; ,})
+    css = re.sub(r'([:{;,])\s+', r'\1', css)
+
+    # Step 4: Remove spaces before `{`
+    css = re.sub(r'\s+{', '{', css)
+
+    # Step 5: Remove empty lines
+    css = re.sub(r'^\s*$', '', css, flags=re.MULTILINE)
+
+    # Step 6: Collapse all lines into one
+    css = re.sub(r'\n', '', css)
+
+    return css
+
+
 with open(Path(__file__).resolve().parent / '../css/html_readable_style.css', 'r') as f:
     readable_css = f.read()
 with open(Path(__file__).resolve().parent / '../css/html_instruct_style.css', 'r') as f:
@@ -34,6 +58,12 @@ for k in chat_styles:
         style = match.group(1)
         chat_styles[k] = chat_styles.get(style, '') + '\n\n' + '\n'.join(lines[1:])
 
+# Reduce the size of the CSS sources above
+readable_css = minify_css(readable_css)
+instruct_css = minify_css(instruct_css)
+for k in chat_styles:
+    chat_styles[k] = minify_css(chat_styles[k])
+
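To make the effect of `minify_css` concrete, here is roughly what it does to a small rule (illustrative input; the exact output follows from the regex steps above):

```python
css = """
/* message bubbles */
.message {
    color: #333;
    margin: 0 auto;
}
"""

# minify_css(css) -> '.message{color:#333;margin:0 auto;}'
# Comments, newlines and indentation are removed; spaces inside values
# such as '0 auto' are kept.
```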
 
 def fix_newlines(string):
     string = string.replace('\n', '\n\n')
@@ -43,7 +73,6 @@ def fix_newlines(string):
 
 
 def replace_quotes(text):
-
     # Define a list of quote pairs (opening and closing), using HTML entities
     quote_pairs = [
         ('"', '"'),  # Double quotes
@@ -54,14 +83,22 @@ def replace_quotes(text):
         ('‘', '’'),  # Alternative single quotes
         ('&#8220;', '&#8221;'),  # Unicode quotes (numeric entities)
         ('&#x201C;', '&#x201D;'),  # Unicode quotes (hex entities)
+        ('\u201C', '\u201D'),  # Unicode quotes (literal chars)
     ]
 
     # Create a regex pattern that matches any of the quote pairs, including newlines
     pattern = '|'.join(f'({re.escape(open_q)})(.*?)({re.escape(close_q)})' for open_q, close_q in quote_pairs)
 
     # Replace matched patterns with <q> tags, keeping original quotes
-    replaced_text = re.sub(pattern, lambda m: f'{m.group(1)}<q>{m.group(2)}</q>{m.group(3)}', text, flags=re.DOTALL)
+    def replacer(m):
+        # Find the first non-None group set
+        for i in range(1, len(m.groups()), 3):  # Step through each sub-pattern's groups
+            if m.group(i):  # If this sub-pattern matched
+                return f'{m.group(i)}<q>{m.group(i + 1)}</q>{m.group(i + 2)}'
 
+        return m.group(0)  # Fallback (shouldn't happen)
+
+    replaced_text = re.sub(pattern, replacer, text, flags=re.DOTALL)
     return replaced_text
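Because every quote pair contributes three capture groups to the alternation, the groups of the pair that matched start at index 1, 4, 7, and so on, which is why `replacer` walks the groups in steps of three. A compact illustration with just two pairs (assuming the `<q>`-wrapping shown above):

```python
import re

quote_pairs = [('"', '"'), ('“', '”')]
pattern = '|'.join(f'({re.escape(o)})(.*?)({re.escape(c)})' for o, c in quote_pairs)


def wrap_in_q(m):
    for i in range(1, len(m.groups()), 3):
        if m.group(i):
            return f'{m.group(i)}<q>{m.group(i + 1)}</q>{m.group(i + 2)}'

    return m.group(0)


print(re.sub(pattern, wrap_in_q, 'She said “hi” and left.', flags=re.DOTALL))
# -> She said “<q>hi</q>” and left.
```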
 
 
@@ -69,6 +106,52 @@ def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
+def add_long_list_class(html):
+    '''
+    Adds a long-list class to <ul> or <ol> containing long <li> items.
+    These will receive a smaller margin/padding in the CSS.
+    '''
+
+    # Helper function to check if a tag is within <pre> or <code> blocks
+    def is_within_block(start_idx, end_idx, block_matches):
+        return any(start < start_idx < end or start < end_idx < end for start, end in block_matches)
+
+    # Find all <pre>...</pre> and <code>...</code> blocks
+    pre_blocks = [(m.start(), m.end()) for m in re.finditer(r'<pre.*?>.*?</pre>', html, re.DOTALL)]
+    code_blocks = [(m.start(), m.end()) for m in re.finditer(r'<code.*?>.*?</code>', html, re.DOTALL)]
+    all_blocks = pre_blocks + code_blocks
+
+    # Pattern to find <ul>...</ul> and <ol>...</ol> blocks and their contents
+    list_pattern = re.compile(r'(<[uo]l.*?>)(.*?)(</[uo]l>)', re.DOTALL)
+    li_pattern = re.compile(r'<li.*?>(.*?)</li>', re.DOTALL)
+
+    def process_list(match):
+        start_idx, end_idx = match.span()
+        if is_within_block(start_idx, end_idx, all_blocks):
+            return match.group(0)  # Leave the block unchanged if within <pre> or <code>
+
+        opening_tag = match.group(1)
+        list_content = match.group(2)
+        closing_tag = match.group(3)
+
+        # Find all list items within this list
+        li_matches = li_pattern.finditer(list_content)
+        has_long_item = any(len(li_match.group(1).strip()) > 224 for li_match in li_matches)
+
+        if has_long_item:
+            # Add class="long-list" to the opening tag if it doesn't already have a class
+            if 'class=' not in opening_tag:
+                opening_tag = opening_tag[:-1] + ' class="long-list">'
+            else:
+                # If there's already a class, append long-list to it
+                opening_tag = re.sub(r'class="([^"]*)"', r'class="\1 long-list"', opening_tag)
+
+        return opening_tag + list_content + closing_tag
+
+    # Process HTML and replace list blocks
+    return list_pattern.sub(process_list, html)
+
+
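A small usage check for `add_long_list_class` (the 224-character threshold comes from the code above; outputs shown approximately):

```python
short = '<ul><li>ok</li></ul>'
long_item = '<ol><li>' + 'x' * 300 + '</li></ol>'

# add_long_list_class(short)     -> '<ul><li>ok</li></ul>'               (unchanged)
# add_long_list_class(long_item) -> '<ol class="long-list"><li>xx…x</li></ol>'
```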
       @functools.lru_cache(maxsize=None)
       def convert_to_markdown(string):
       
      @@ -104,6 +187,7 @@ def convert_to_markdown(string):
           result = ''
           is_code = False
           is_latex = False
      +
           for line in string.split('\n'):
               stripped_line = line.strip()
       
      @@ -122,11 +206,14 @@ def convert_to_markdown(string):
       
               result += line
       
      -        # Don't add an extra \n for tables, code, or LaTeX
      +        # Don't add an extra \n for code, LaTeX, or tables
               if is_code or is_latex or line.startswith('|'):
                   result += '\n'
      +        # Also don't add an extra \n for lists
      +        elif stripped_line.startswith('-') or stripped_line.startswith('*') or stripped_line.startswith('+') or stripped_line.startswith('>') or re.match(r'\d+\.', stripped_line):
      +            result += '  \n'
               else:
      -            result += '\n\n'
      +            result += '  \n'
       
           result = result.strip()
           if is_code:
      @@ -145,19 +232,26 @@ def convert_to_markdown(string):
               result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result)
       
               # Convert to HTML using markdown
      -        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
      +        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])
       
               # Remove the delete string from the HTML output
               pos = html_output.rfind(delete_str)
               if pos > -1:
                   html_output = html_output[:pos] + html_output[pos + len(delete_str):]
           else:
      -        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
      +        # Convert to HTML using markdown
      +        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])
       
           # Unescape code blocks
     pattern = re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL)
           html_output = pattern.sub(lambda x: html.unescape(x.group()), html_output)
       
      +    # Unescape backslashes
      +    html_output = html_output.replace('\\\\', '\\')
      +
      +    # Add "long-list" class to 
        or
          containing a long
        1. item + html_output = add_long_list_class(html_output) + return html_output @@ -208,34 +302,43 @@ def get_image_cache(path): return image_cache[path][1] +copy_svg = '''''' +refresh_svg = '''''' +copy_button = f'' +refresh_button = f'' + + def generate_instruct_html(history): output = f'
          ' - for i, _row in enumerate(history): - row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row] - if row[0]: # don't display empty user messages - output += f""" -
          -
          -
          - {row[0]} -
          -
          -
          - """ + for i in range(len(history['visible'])): + row_visible = history['visible'][i] + row_internal = history['internal'][i] + converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] - output += f""" -
          -
          -
          - {row[1]} -
          -
          -
          - """ + if converted_visible[0]: # Don't display empty user messages + output += ( + f'
          ' + f'
          ' + f'
          {converted_visible[0]}
          ' + f'{copy_button}' + f'
          ' + f'
          ' + ) + + output += ( + f'
          ' + f'
          ' + f'
          {converted_visible[1]}
          ' + f'{copy_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'
          ' + f'
          ' + ) output += "
          " - return output @@ -243,44 +346,46 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= output = f'
          ' # We use ?character and ?time.time() to force the browser to reset caches - img_bot = f'' if Path("cache/pfp_character_thumb.png").exists() else '' - img_me = f'' if Path("cache/pfp_me.png").exists() else '' + img_bot = ( + f'' + if Path("cache/pfp_character_thumb.png").exists() else '' + ) - for i, _row in enumerate(history): - row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row] + img_me = ( + f'' + if Path("cache/pfp_me.png").exists() else '' + ) - if row[0]: # don't display empty user messages - output += f""" -
          -
          - {img_me} -
          -
          -
          - {name1} -
          -
          - {row[0]} -
          -
          -
          - """ + for i in range(len(history['visible'])): + row_visible = history['visible'][i] + row_internal = history['internal'][i] + converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] - output += f""" -
          -
          - {img_bot} -
          -
          -
          - {name2} -
          -
          - {row[1]} -
          -
          -
          - """ + if converted_visible[0]: # Don't display empty user messages + output += ( + f'
          ' + f'
          {img_me}
          ' + f'
          ' + f'
          {name1}
          ' + f'
          {converted_visible[0]}
          ' + f'{copy_button}' + f'
          ' + f'
          ' + ) + + output += ( + f'
          ' + f'
          {img_bot}
          ' + f'
          ' + f'
          {name2}
          ' + f'
          {converted_visible[1]}
          ' + f'{copy_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'
          ' + f'
          ' + ) output += "
          " return output @@ -289,29 +394,32 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= def generate_chat_html(history, name1, name2, reset_cache=False): output = f'
          ' - for i, _row in enumerate(history): - row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row] + for i in range(len(history['visible'])): + row_visible = history['visible'][i] + row_internal = history['internal'][i] + converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] - if row[0]: # don't display empty user messages - output += f""" -
          -
          -
          - {row[0]} -
          -
          -
          - """ + if converted_visible[0]: # Don't display empty user messages + output += ( + f'
          ' + f'
          ' + f'
          {converted_visible[0]}
          ' + f'{copy_button}' + f'
          ' + f'
          ' + ) - output += f""" -
          -
          -
          - {row[1]} -
          -
          -
          - """ + output += ( + f'
          ' + f'
          ' + f'
          {converted_visible[1]}
          ' + f'{copy_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'
          ' + f'
          ' + ) output += "
          " return output @@ -319,8 +427,8 @@ def generate_chat_html(history, name1, name2, reset_cache=False): def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False): if mode == 'instruct': - return generate_instruct_html(history['visible']) + return generate_instruct_html(history) elif style == 'wpp': - return generate_chat_html(history['visible'], name1, name2) + return generate_chat_html(history, name1, name2) else: - return generate_cai_chat_html(history['visible'], name1, name2, style, character, reset_cache) + return generate_cai_chat_html(history, name1, name2, style, character, reset_cache) diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py index 64280dc9..f3872a74 100644 --- a/modules/llama_cpp_python_hijack.py +++ b/modules/llama_cpp_python_hijack.py @@ -2,17 +2,18 @@ import importlib import platform from typing import Sequence +import numpy as np from tqdm import tqdm from modules import shared from modules.cache_utils import process_llamacpp_cache - imported_module = None +not_available_modules = set() def llama_cpp_lib(): - global imported_module + global imported_module, not_available_modules # Determine the platform is_macos = platform.system() == 'Darwin' @@ -31,6 +32,9 @@ def llama_cpp_lib(): ] for arg, lib_name in lib_names: + if lib_name in not_available_modules: + continue + should_import = (arg is None or getattr(shared.args, arg)) if should_import: @@ -44,6 +48,7 @@ def llama_cpp_lib(): monkey_patch_llama_cpp_python(return_lib) return return_lib except ImportError: + not_available_modules.add(lib_name) continue return None @@ -57,11 +62,9 @@ def eval_with_progress(self, tokens: Sequence[int]): with tqdm to show prompt processing progress. """ - assert self._ctx.ctx is not None - assert self._batch.batch is not None self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1) - if len(tokens) > 1: + if len(tokens) > self.n_batch: progress_bar = tqdm(range(0, len(tokens), self.n_batch), desc="Prompt evaluation", leave=False) else: progress_bar = range(0, len(tokens), self.n_batch) @@ -80,13 +83,20 @@ def eval_with_progress(self, tokens: Sequence[int]): if self.context_params.logits_all: rows = n_tokens cols = self._n_vocab - logits = self._ctx.get_logits()[: rows * cols] - self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits + logits = np.ctypeslib.as_array( + self._ctx.get_logits(), shape=(rows * cols,) + ) + self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits + self.last_updated_index = n_past + n_tokens - 1 else: rows = 1 cols = self._n_vocab - logits = self._ctx.get_logits()[: rows * cols] - self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits + logits = np.ctypeslib.as_array( + self._ctx.get_logits(), shape=(rows * cols,) + ) + last_token_index = min(n_past + n_tokens - 1, self.scores.shape[0] - 1) + self.scores[last_token_index, :] = logits.reshape(-1) + self.last_updated_index = last_token_index # Update n_tokens self.n_tokens += n_tokens diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 327e3a7b..f9964fe8 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -9,6 +9,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from modules import shared from modules.llama_cpp_python_hijack import llama_cpp_lib +from modules.llamacpp_model import get_llamacpp_cache_type_for_string from modules.logging_colors import logger @@ -127,7 +128,7 @@ class LlamacppHF(PreTrainedModel): self.model.reset() self.model.eval(seq) - logits = 
torch.tensor(self.model.scores[self.model.n_tokens - 1, :]).view(1, 1, -1).to(input_ids.device) + logits = torch.tensor(self.model.scores[self.model.last_updated_index, :]).view(1, 1, -1).to(input_ids.device) else: self.model.reset() self.model.eval(seq) @@ -196,14 +197,12 @@ class LlamacppHF(PreTrainedModel): 'flash_attn': shared.args.flash_attn } - if shared.args.cache_4bit: - params["type_k"] = 2 - params["type_v"] = 2 - elif shared.args.cache_8bit: - params["type_k"] = 8 - params["type_v"] = 8 + if shared.args.cache_type != 'fp16': + params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) + params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) Llama = llama_cpp_lib().Llama model = Llama(**params) + model.last_updated_index = -1 return LlamacppHF(model, model_file) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index a16230ca..c79755e4 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -10,6 +10,35 @@ from modules.llama_cpp_python_hijack import llama_cpp_lib from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length +llamacpp_quant_mapping = { + 'f32': 0, + 'fp16': 1, + 'q4_0': 2, + 'q4_1': 3, + 'q5_0': 6, + 'q5_1': 7, + 'q8_0': 8, + 'q8_1': 9, + 'q2_k': 10, + 'q3_k': 11, + 'q4_k': 12, + 'q5_k': 13, + 'q6_k': 14, + 'q8_k': 15, + 'iq4_nl': 20, + 'bf16': 30, +} + +llamacpp_valid_cache_types = {'fp16', 'q8_0', 'q4_0'} + + +def get_llamacpp_cache_type_for_string(quant_type: str): + quant_type = quant_type.lower() + if quant_type in llamacpp_valid_cache_types: + return llamacpp_quant_mapping[quant_type] + else: + raise ValueError(f"Invalid cache type for llama.cpp: {quant_type}. Valid options are: fp16, q8_0, q4_0.") + def ban_eos_logits_processor(eos_token, input_ids, logits): logits[eos_token] = -float('inf') @@ -75,12 +104,9 @@ class LlamaCppModel: 'flash_attn': shared.args.flash_attn } - if shared.args.cache_4bit: - params["type_k"] = 2 - params["type_v"] = 2 - elif shared.args.cache_8bit: - params["type_k"] = 8 - params["type_v"] = 8 + if shared.args.cache_type != 'fp16': + params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) + params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) result.model = Llama(**params) if cache_capacity > 0: @@ -96,7 +122,14 @@ class LlamaCppModel: return self.model.tokenize(string) def decode(self, ids, **kwargs): - return self.model.detokenize(ids).decode('utf-8') + detokenized = self.model.detokenize(ids) + try: + # Attempt strict UTF-8 decoding first + return detokenized.decode('utf-8', 'strict') + except UnicodeDecodeError as e: + # Log the error and fall back to UTF-8 with replacement + logger.warning(f"Invalid UTF-8 in detokenized output. 
Using replacement characters.\n{e}") + return detokenized.decode('utf-8', 'replace') def get_logits(self, tokens): self.model.reset() @@ -136,7 +169,7 @@ class LlamaCppModel: prompt=prompt, max_tokens=state['max_new_tokens'], temperature=state['temperature'], - top_p=state['top_p'], + top_p=state['top_p'] if state['top_p'] < 1 else 0.999, min_p=state['min_p'], typical_p=state['typical_p'], frequency_penalty=state['frequency_penalty'], diff --git a/modules/loaders.py b/modules/loaders.py index 549de5fb..cd864e40 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -7,126 +7,105 @@ from modules import shared loaders_and_params = OrderedDict({ 'Transformers': [ - 'cpu_memory', 'gpu_memory', - 'load_in_8bit', - 'bf16', - 'cpu', - 'disk', - 'auto_devices', - 'load_in_4bit', - 'use_double_quant', - 'quant_type', - 'compute_dtype', - 'trust_remote_code', - 'no_use_fast', - 'use_flash_attention_2', - 'use_eager_attention', + 'cpu_memory', 'alpha_value', 'compress_pos_emb', - 'disable_exllama', - 'disable_exllamav2', - 'transformers_info', - ], - 'llama.cpp': [ - 'n_ctx', - 'n_gpu_layers', - 'cache_8bit', - 'cache_4bit', - 'tensor_split', - 'n_batch', - 'threads', - 'threads_batch', - 'no_mmap', - 'mlock', - 'no_mul_mat_q', - 'rope_freq_base', - 'compress_pos_emb', + 'compute_dtype', + 'quant_type', + 'load_in_8bit', + 'load_in_4bit', + 'torch_compile', + 'use_flash_attention_2', + 'auto_devices', 'cpu', - 'numa', - 'no_offload_kqv', - 'row_split', - 'tensorcores', - 'flash_attn', - 'streaming_llm', - 'attention_sink_size', - ], - 'llamacpp_HF': [ - 'n_ctx', - 'n_gpu_layers', - 'cache_8bit', - 'cache_4bit', - 'tensor_split', - 'n_batch', - 'threads', - 'threads_batch', - 'no_mmap', - 'mlock', - 'no_mul_mat_q', - 'rope_freq_base', - 'compress_pos_emb', - 'cpu', - 'numa', - 'cfg_cache', + 'disk', + 'use_double_quant', + 'use_eager_attention', + 'bf16', + 'trust_remote_code', 'no_use_fast', - 'logits_all', - 'no_offload_kqv', - 'row_split', + ], + 'llama.cpp': [ + 'n_gpu_layers', + 'threads', + 'threads_batch', + 'n_batch', + 'n_ctx', + 'cache_type', + 'tensor_split', + 'rope_freq_base', + 'compress_pos_emb', + 'attention_sink_size', 'tensorcores', 'flash_attn', 'streaming_llm', + 'cpu', + 'row_split', + 'no_offload_kqv', + 'no_mul_mat_q', + 'no_mmap', + 'mlock', + 'numa', + ], + 'llamacpp_HF': [ + 'n_gpu_layers', + 'threads', + 'threads_batch', + 'n_batch', + 'n_ctx', + 'cache_type', + 'tensor_split', + 'rope_freq_base', + 'compress_pos_emb', 'attention_sink_size', + 'tensorcores', + 'flash_attn', + 'streaming_llm', + 'cpu', + 'row_split', + 'no_offload_kqv', + 'no_mul_mat_q', + 'no_mmap', + 'mlock', + 'numa', + 'cfg_cache', + 'logits_all', + 'trust_remote_code', + 'no_use_fast', 'llamacpp_HF_info', ], 'ExLlamav2_HF': [ - 'gpu_split', 'max_seq_len', - 'cfg_cache', + 'cache_type', + 'gpu_split', + 'alpha_value', + 'compress_pos_emb', + 'num_experts_per_token', + 'autosplit', + 'enable_tp', 'no_flash_attn', 'no_xformers', 'no_sdpa', - 'num_experts_per_token', - 'cache_8bit', - 'cache_4bit', - 'autosplit', - 'alpha_value', - 'compress_pos_emb', + 'cfg_cache', 'trust_remote_code', 'no_use_fast', ], 'ExLlamav2': [ - 'gpu_split', 'max_seq_len', + 'cache_type', + 'gpu_split', + 'alpha_value', + 'compress_pos_emb', + 'num_experts_per_token', + 'autosplit', + 'enable_tp', 'no_flash_attn', 'no_xformers', 'no_sdpa', - 'num_experts_per_token', - 'cache_8bit', - 'cache_4bit', - 'autosplit', - 'alpha_value', - 'compress_pos_emb', 'exllamav2_info', ], - 'AutoGPTQ': [ - 'triton', - 'no_inject_fused_mlp', - 
'no_use_cuda_fp16', - 'wbits', - 'groupsize', - 'desc_act', - 'disable_exllama', - 'disable_exllamav2', - 'gpu_memory', - 'cpu_memory', - 'cpu', - 'disk', - 'auto_devices', - 'trust_remote_code', - 'no_use_fast', - 'autogptq_info', - ], 'HQQ': [ 'hqq_backend', 'trust_remote_code', @@ -143,192 +122,209 @@ loaders_and_params = OrderedDict({ def transformers_samplers(): return { 'temperature', - 'temperature_last', - 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', - 'top_p', 'min_p', + 'top_p', 'top_k', 'typical_p', + 'xtc_threshold', + 'xtc_probability', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', 'repetition_penalty', - 'presence_penalty', 'frequency_penalty', - 'repetition_penalty_range', + 'presence_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', - 'dry_multiplier', - 'dry_base', - 'dry_allowed_length', - 'dry_sequence_breakers', - 'seed', - 'do_sample', + 'repetition_penalty_range', 'penalty_alpha', + 'guidance_scale', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', - 'grammar_file_row', - 'grammar_string', - 'guidance_scale', - 'negative_prompt', + 'prompt_lookup_num_tokens', + 'do_sample', + 'dynamic_temperature', + 'temperature_last', + 'auto_max_new_tokens', 'ban_eos_token', - 'custom_token_bans', - 'sampler_priority', 'add_bos_token', 'skip_special_tokens', - 'auto_max_new_tokens', - 'prompt_lookup_num_tokens' + 'static_cache', + 'seed', + 'sampler_priority', + 'custom_token_bans', + 'negative_prompt', + 'dry_sequence_breakers', + 'grammar_string', + 'grammar_file_row', } loaders_samplers = { 'Transformers': transformers_samplers(), - 'AutoGPTQ': transformers_samplers(), 'HQQ': transformers_samplers(), 'ExLlamav2': { 'temperature', - 'temperature_last', - 'top_p', + 'dynatemp_low', + 'dynatemp_high', + 'dynatemp_exponent', + 'smoothing_factor', 'min_p', + 'top_p', 'top_k', 'typical_p', + 'xtc_threshold', + 'xtc_probability', 'tfs', 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', 'repetition_penalty', - 'presence_penalty', 'frequency_penalty', + 'presence_penalty', 'repetition_penalty_range', - 'seed', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'dynamic_temperature', + 'temperature_last', + 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', - 'custom_token_bans', 'skip_special_tokens', - 'auto_max_new_tokens', + 'seed', + 'custom_token_bans', + 'dry_sequence_breakers', }, 'ExLlamav2_HF': { 'temperature', - 'temperature_last', - 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', - 'top_p', 'min_p', + 'top_p', 'top_k', 'typical_p', + 'xtc_threshold', + 'xtc_probability', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', 'repetition_penalty', - 'presence_penalty', 'frequency_penalty', - 'repetition_penalty_range', + 'presence_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', - 'dry_multiplier', - 'dry_base', - 'dry_allowed_length', - 'dry_sequence_breakers', - 'seed', - 'do_sample', + 'repetition_penalty_range', + 'guidance_scale', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', - 'grammar_file_row', - 'grammar_string', - 'guidance_scale', - 'negative_prompt', + 'do_sample', + 'dynamic_temperature', + 'temperature_last', + 'auto_max_new_tokens', 'ban_eos_token', - 'custom_token_bans', - 'sampler_priority', 'add_bos_token', 'skip_special_tokens', - 'auto_max_new_tokens', + 
'seed', + 'sampler_priority', + 'custom_token_bans', + 'negative_prompt', + 'dry_sequence_breakers', + 'grammar_string', + 'grammar_file_row', }, 'llama.cpp': { 'temperature', - 'top_p', 'min_p', + 'top_p', 'top_k', 'typical_p', 'tfs', 'repetition_penalty', - 'presence_penalty', 'frequency_penalty', - 'seed', + 'presence_penalty', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', - 'grammar_file_row', - 'grammar_string', 'ban_eos_token', + 'seed', 'custom_token_bans', + 'grammar_string', + 'grammar_file_row', }, 'llamacpp_HF': { 'temperature', - 'temperature_last', - 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', - 'top_p', 'min_p', + 'top_p', 'top_k', 'typical_p', + 'xtc_threshold', + 'xtc_probability', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', 'repetition_penalty', - 'presence_penalty', 'frequency_penalty', - 'repetition_penalty_range', + 'presence_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', - 'dry_multiplier', - 'dry_base', - 'dry_allowed_length', - 'dry_sequence_breakers', - 'seed', - 'do_sample', + 'repetition_penalty_range', + 'guidance_scale', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', - 'grammar_file_row', - 'grammar_string', - 'guidance_scale', - 'negative_prompt', + 'do_sample', + 'dynamic_temperature', + 'temperature_last', + 'auto_max_new_tokens', 'ban_eos_token', - 'custom_token_bans', - 'sampler_priority', 'add_bos_token', 'skip_special_tokens', - 'auto_max_new_tokens', + 'seed', + 'sampler_priority', + 'custom_token_bans', + 'negative_prompt', + 'dry_sequence_breakers', + 'grammar_string', + 'grammar_file_row', }, 'TensorRT-LLM': { 'temperature', 'top_p', 'top_k', 'repetition_penalty', - 'presence_penalty', 'frequency_penalty', - 'ban_eos_token', + 'presence_penalty', 'auto_max_new_tokens', + 'ban_eos_token', } } diff --git a/modules/logits.py b/modules/logits.py index 73cabb41..f8a1e80c 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -2,11 +2,10 @@ import time import traceback import torch -from transformers import is_torch_npu_available, is_torch_xpu_available from modules import models, sampler_hijack, shared from modules.logging_colors import logger -from modules.models import load_model +from modules.models import get_device, load_model from modules.text_generation import generate_reply global_scores = None @@ -57,23 +56,21 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur scores = sampler_hijack.global_scores[-1] else: if is_non_hf_exllamav2: - if is_torch_xpu_available(): - tokens = shared.tokenizer.encode(prompt).to("xpu:0") - elif is_torch_npu_available(): - tokens = shared.tokenizer.encode(prompt).to("npu:0") - else: - tokens = shared.tokenizer.encode(prompt).cuda() + device = get_device() + tokens = shared.tokenizer.encode(prompt) + if device: + tokens = tokens.to(device) + scores = shared.model.get_logits(tokens)[-1][-1] elif is_non_hf_llamacpp: tokens = shared.tokenizer.encode(prompt) scores = shared.model.get_logits(tokens)[-1][-1] else: - if is_torch_xpu_available(): - tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0") - elif is_torch_npu_available(): - tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("npu:0") - else: - tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda() + device = get_device() + tokens = shared.tokenizer.encode(prompt, return_tensors='pt') + if device: + tokens = tokens.to(device) + output 
= shared.model(input_ids=tokens) scores = output['logits'][-1][-1] diff --git a/modules/models.py b/modules/models.py index b0e2346e..f551b828 100644 --- a/modules/models.py +++ b/modules/models.py @@ -3,7 +3,6 @@ import os import pprint import re import time -import traceback from pathlib import Path import torch @@ -21,11 +20,11 @@ from transformers import ( AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, - GPTQConfig + is_torch_npu_available, + is_torch_xpu_available ) import modules.shared as shared -from modules import sampler_hijack from modules.logging_colors import logger from modules.models_settings import get_model_metadata @@ -56,8 +55,6 @@ if shared.args.deepspeed: ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration -sampler_hijack.hijack_samplers() - last_generation_time = time.time() @@ -70,7 +67,6 @@ def load_model(model_name, loader=None): shared.model_name = model_name load_func_map = { 'Transformers': huggingface_loader, - 'AutoGPTQ': AutoGPTQ_loader, 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, @@ -90,6 +86,7 @@ def load_model(model_name, loader=None): raise ValueError shared.args.loader = loader + clear_torch_cache() output = load_func_map[loader](model_name) if type(output) is tuple: model, tokenizer = output @@ -163,30 +160,48 @@ def huggingface_loader(model_name): else: LoaderClass = AutoModelForCausalLM + # Determine if we should use default loading + should_use_default_loading = not any([ + shared.args.cpu, + shared.args.load_in_8bit, + shared.args.load_in_4bit, + shared.args.auto_devices, + shared.args.disk, + shared.args.deepspeed, + shared.args.gpu_memory is not None, + shared.args.cpu_memory is not None, + shared.args.compress_pos_emb > 1, + shared.args.alpha_value > 1, + ]) + # Load the model without any special settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]): + if should_use_default_loading: logger.info("TRANSFORMERS_PARAMS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) print() model = LoaderClass.from_pretrained(path_to_model, **params) if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit): - if torch.backends.mps.is_available(): - device = torch.device('mps') + device = get_device() + if device: model = model.to(device) - elif is_xpu_available(): - device = torch.device("xpu") - model = model.to(device) - elif is_npu_available(): - device = torch.device("npu") - model = model.to(device) - else: - model = model.cuda() # DeepSpeed ZeRO-3 elif shared.args.deepspeed: - model = LoaderClass.from_pretrained(path_to_model, torch_dtype=params['torch_dtype'], trust_remote_code=params.get('trust_remote_code')) - model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0] + model = LoaderClass.from_pretrained( + path_to_model, + torch_dtype=params['torch_dtype'], + trust_remote_code=params.get('trust_remote_code') + ) + + model = deepspeed.initialize( + model=model, + config_params=ds_config, + model_parameters=None, + optimizer=None, + lr_scheduler=None + )[0] + 
model.module.eval() # Inference logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}') @@ -208,16 +223,15 @@ def huggingface_loader(model_name): # and https://huggingface.co/blog/4bit-transformers-bitsandbytes quantization_config_params = { 'load_in_4bit': True, - 'bnb_4bit_compute_dtype': eval("torch.{}".format(shared.args.compute_dtype)) if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None, + 'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None, 'bnb_4bit_quant_type': shared.args.quant_type, 'bnb_4bit_use_double_quant': shared.args.use_double_quant, 'llm_int8_enable_fp32_cpu_offload': True } - params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params) elif shared.args.load_in_8bit: - if any((shared.args.auto_devices, shared.args.gpu_memory)): + if shared.args.auto_devices or shared.args.gpu_memory: params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True) else: params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True) @@ -237,21 +251,6 @@ def huggingface_loader(model_name): if shared.args.disk: params['offload_folder'] = shared.args.disk_cache_dir - if shared.args.disable_exllama or shared.args.disable_exllamav2: - try: - gptq_config = GPTQConfig( - bits=config.quantization_config.get('bits', 4), - disable_exllama=shared.args.disable_exllama, - disable_exllamav2=shared.args.disable_exllamav2, - ) - - params['quantization_config'] = gptq_config - logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.') - except: - exc = traceback.format_exc() - logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?') - print(exc) - if shared.args.compress_pos_emb > 1: params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} elif shared.args.alpha_value > 1: @@ -262,6 +261,9 @@ def huggingface_loader(model_name): print() model = LoaderClass.from_pretrained(path_to_model, **params) + if shared.args.torch_compile: + model = torch.compile(model) + return model @@ -302,12 +304,6 @@ def llamacpp_HF_loader(model_name): return model -def AutoGPTQ_loader(model_name): - import modules.AutoGPTQ_loader - - return modules.AutoGPTQ_loader.load_quantized(model_name) - - def ExLlamav2_loader(model_name): from modules.exllamav2 import Exllamav2Model @@ -322,8 +318,11 @@ def ExLlamav2_HF_loader(model_name): def HQQ_loader(model_name): - from hqq.core.quantize import HQQBackend, HQQLinear - from hqq.models.hf.base import AutoHQQHFModel + try: + from hqq.core.quantize import HQQBackend, HQQLinear + from hqq.models.hf.base import AutoHQQHFModel + except ModuleNotFoundError: + raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.") logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"") @@ -334,7 +333,10 @@ def HQQ_loader(model_name): def TensorRT_LLM_loader(model_name): - from modules.tensorrt_llm import TensorRTLLMModel + try: + from modules.tensorrt_llm import TensorRTLLMModel + except ModuleNotFoundError: + raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. 
Please install it manually following the instructions in the TensorRT-LLM GitHub repository.") model = TensorRTLLMModel.from_pretrained(model_name) return model @@ -370,13 +372,34 @@ def get_max_memory_dict(): return max_memory if len(max_memory) > 0 else None +def get_device(): + if torch.cuda.is_available(): + return torch.device('cuda') + elif shared.args.deepspeed: + import deepspeed + return deepspeed.get_accelerator().current_device_name() + elif torch.backends.mps.is_available(): + return torch.device('mps') + elif is_torch_xpu_available(): + return torch.device('xpu:0') + elif is_torch_npu_available(): + return torch.device('npu:0') + else: + return None + + def clear_torch_cache(): gc.collect() if not shared.args.cpu: - if is_xpu_available(): - torch.xpu.empty_cache() - else: + if torch.cuda.is_available(): torch.cuda.empty_cache() + elif is_xpu_available(): + torch.xpu.empty_cache() + elif is_npu_available(): + torch.npu.empty_cache() + elif torch.backends.mps.is_available(): + if hasattr(torch.backends.mps, 'empty_cache'): + torch.backends.mps.empty_cache() def unload_model(keep_model_name=False): diff --git a/modules/models_settings.py b/modules/models_settings.py index 1bb00ceb..8d658523 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -11,9 +11,6 @@ def get_fallback_settings(): return { 'bf16': False, 'use_eager_attention': False, - 'wbits': 'None', - 'groupsize': 'None', - 'desc_act': False, 'max_seq_len': 2048, 'n_ctx': 2048, 'rope_freq_base': 0, @@ -111,26 +108,6 @@ def get_model_metadata(model): if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']: model_settings['use_eager_attention'] = True - # Read GPTQ metadata for old GPTQ loaders - if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2': - if 'bits' in metadata['quantization_config']: - model_settings['wbits'] = metadata['quantization_config']['bits'] - if 'group_size' in metadata['quantization_config']: - model_settings['groupsize'] = metadata['quantization_config']['group_size'] - if 'desc_act' in metadata['quantization_config']: - model_settings['desc_act'] = metadata['quantization_config']['desc_act'] - - # Read AutoGPTQ metadata - path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json') - if path.exists(): - metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - if 'bits' in metadata: - model_settings['wbits'] = metadata['bits'] - if 'group_size' in metadata: - model_settings['groupsize'] = metadata['group_size'] - if 'desc_act' in metadata: - model_settings['desc_act'] = metadata['desc_act'] - # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' if path.exists(): @@ -178,7 +155,7 @@ def infer_loader(model_name, model_settings): path_to_model = Path(f'{shared.args.model_dir}/{model_name}') if not path_to_model.exists(): loader = None - elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0): + elif (path_to_model / 'quantize_config.json').exists(): # Old GPTQ metadata file loader = 'ExLlamav2_HF' elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists(): loader = 'llamacpp_HF' @@ -215,16 +192,11 @@ def update_model_parameters(state, initial=False): if initial and element in 
shared.provided_arguments: continue - # Setting null defaults - if element in ['wbits', 'groupsize'] and value == 'None': - value = vars(shared.args_defaults)[element] - elif element in ['cpu_memory'] and value == 0: + if element in ['cpu_memory'] and value == 0: value = vars(shared.args_defaults)[element] # Making some simple conversions - if element in ['wbits', 'groupsize']: - value = int(value) - elif element == 'cpu_memory' and value is not None: + if element == 'cpu_memory' and value is not None: value = f"{value}MiB" setattr(shared.args, element, value) @@ -251,15 +223,12 @@ def apply_model_settings_to_state(model, state): loader = model_settings.pop('loader') # If the user is using an alternative loader for the same model type, let them keep using it - if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']): + if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: if k in state: - if k in ['wbits', 'groupsize']: - state[k] = str(model_settings[k]) - else: - state[k] = model_settings[k] + state[k] = model_settings[k] return state diff --git a/modules/presets.py b/modules/presets.py index b00e829e..b841af53 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -13,38 +13,40 @@ from modules.logging_colors import logger def default_preset(): return { 'temperature': 1, - 'temperature_last': False, - 'dynamic_temperature': False, 'dynatemp_low': 1, 'dynatemp_high': 1, 'dynatemp_exponent': 1, 'smoothing_factor': 0, 'smoothing_curve': 1, - 'top_p': 1, 'min_p': 0, + 'top_p': 1, 'top_k': 0, - 'repetition_penalty': 1, - 'presence_penalty': 0, - 'frequency_penalty': 0, - 'repetition_penalty_range': 1024, 'typical_p': 1, - 'tfs': 1, - 'top_a': 0, + 'xtc_threshold': 0.1, + 'xtc_probability': 0, 'epsilon_cutoff': 0, 'eta_cutoff': 0, - 'guidance_scale': 1, + 'tfs': 1, + 'top_a': 0, + 'dry_multiplier': 0, + 'dry_allowed_length': 2, + 'dry_base': 1.75, + 'repetition_penalty': 1, + 'frequency_penalty': 0, + 'presence_penalty': 0, + 'encoder_repetition_penalty': 1, + 'no_repeat_ngram_size': 0, + 'repetition_penalty_range': 1024, 'penalty_alpha': 0, + 'guidance_scale': 1, 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, 'do_sample': True, - 'encoder_repetition_penalty': 1, - 'no_repeat_ngram_size': 0, - 'dry_multiplier': 0, - 'dry_base': 1.75, - 'dry_allowed_length': 2, + 'dynamic_temperature': False, + 'temperature_last': False, + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', - 'sampler_priority': 'temperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat' } diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 9fb661ae..d202af1f 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -1,10 +1,11 @@ import json import math import pprint +import random import torch import transformers -from transformers import LogitsWarper, is_torch_xpu_available +from transformers import LogitsWarper from transformers.generation.logits_process import ( LogitNormalization, LogitsProcessor, @@ -13,6 +14,7 @@ from transformers.generation.logits_process import ( from modules import shared from modules.logging_colors import logger 
+from modules.models import get_device global_scores = None @@ -191,6 +193,53 @@ class TopALogitsWarper(LogitsWarper): return scores +# Exclude Top Choices (XTC) +class XTCLogitsWarper(LogitsWarper): + def __init__(self, threshold: float, probability: float, filter_value: float = -float("Inf")): + self.threshold = threshold + self.probability = probability + self.filter_value = filter_value + self.special_token_ids = [ + shared.tokenizer.encode("\n")[-1], + ] + + if shared.tokenizer.eos_token_id is not None: + self.special_token_ids.append(shared.tokenizer.eos_token_id) + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + # `random` returns values in the half-open range [0, 1), so setting `probability` + # to 0 means the sampler never takes action, while setting it to 1 means the sampler + # always takes action. + # + # Note that while XTC is most intuitively described as "if multiple tokens meet + # the threshold, then with probability...", reversing the two conditions is logically + # equivalent, and improves performance because processing can immediately be stopped + # if the random check fails. + if random.random() >= self.probability: + return scores + + sorted_logits, sorted_indices = torch.sort(scores, descending=True) + probs = sorted_logits.softmax(dim=-1) + + sorted_indices_to_remove = torch.full_like(probs, False, dtype=torch.bool) + + # This operation sets exactly those indices to `True` for which the next index has + # probability above the threshold. Since `probs` is sorted, those are the indices + # of all tokens that meet the threshold, *except* the least probable one. + sorted_indices_to_remove[..., :-1] = probs[..., 1:] >= self.threshold + + # Convert sorted_indices_to_remove to the original indices + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + + # If newline or EOS tokens would be removed, return the original scores + if indices_to_remove[:, self.special_token_ids].any(): + return scores + + # Otherwise, remove tokens with the mask + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + class DRYLogitsProcessor(LogitsProcessor): def __init__(self, multiplier: float, base: float, allowed_length: int, sequence_breakers: set[int], _range: int): self.multiplier = multiplier @@ -291,12 +340,12 @@ class MirostatLogitsWarper(LogitsWarper): break # Normalize the probabilities of the remaining words - if is_torch_xpu_available(): - prob_topk = torch.softmax(sorted_logits, dim=0).to("xpu") - prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to("xpu") - else: - prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda') - prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda') + prob_topk = torch.softmax(sorted_logits, dim=0) + prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True) + device = get_device() + if device: + prob_topk = prob_topk.to(device) + prev_i = prev_i.to(device) observed_surprise = -math.log2(prob_topk[prev_i]) self.e = observed_surprise - self.mirostat_tau @@ -323,62 +372,143 @@ class SpyLogitsWarper(LogitsWarper): class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor): - ''' - Copied from the transformers library - ''' - - def __init__(self, penalty: float, presence_penalty: float, frequency_penalty: float, _range: int): + def __init__(self, penalty: float, _range: int): if not (penalty > 0): raise ValueError(f"`penalty` has to be strictly positive, but 
is {penalty}") - self.penalty = penalty - self.presence_penalty = presence_penalty - self.frequency_penalty = frequency_penalty self._range = _range + def apply_repetition_penalty(self, input_ids_row, scores_row): + unique_ids = torch.unique(input_ids_row) + score = torch.gather(scores_row, 0, unique_ids) + + # Apply multiplicative repetition penalty + score = torch.where(score < 0, score * self.penalty, score / self.penalty) + scores_row.scatter_(0, unique_ids, score) + return scores_row + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: input_ids = input_ids[:, -self._range:] - - # We loop here because torch.unique() needs to process each row separately in the - # case that batch_size > 1. for input_ids_row, scores_row in zip(input_ids, scores): - unique_ids, counts = torch.unique(input_ids_row, return_counts=True) - score = torch.gather(scores_row, 0, unique_ids) - - # multiplicative repetition penalty - # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability - score = torch.where(score < 0, score * self.penalty, score / self.penalty) - scores_row.scatter_(0, unique_ids, score) - - # presence_penalty and frequency_penalty - raw_presence_penalty = (counts > 0).to(scores.dtype) - raw_frequency_penalty = counts.to(scores.dtype) - additive_penalty = raw_presence_penalty * self.presence_penalty + raw_frequency_penalty * self.frequency_penalty - scores_row.scatter_add_(0, unique_ids, -additive_penalty) + scores_row = self.apply_repetition_penalty(input_ids_row, scores_row) return scores -def get_logits_warper_patch(self, generation_config, **kwargs): +class PresencePenaltyLogitsProcessor(LogitsProcessor): + def __init__(self, presence_penalty: float, _range: int): + self.presence_penalty = presence_penalty + self._range = _range + + def apply_presence_penalty(self, input_ids_row, scores_row): + unique_ids, counts = torch.unique(input_ids_row, return_counts=True) + + # Apply presence penalty + raw_presence_penalty = (counts > 0).to(scores_row.dtype) + presence_penalty = raw_presence_penalty * self.presence_penalty + scores_row.scatter_add_(0, unique_ids, -presence_penalty) + return scores_row + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + input_ids = input_ids[:, -self._range:] + for input_ids_row, scores_row in zip(input_ids, scores): + scores_row = self.apply_presence_penalty(input_ids_row, scores_row) + return scores + + +class FrequencyPenaltyLogitsProcessor(LogitsProcessor): + def __init__(self, frequency_penalty: float, _range: int): + self.frequency_penalty = frequency_penalty + self._range = _range + + def apply_frequency_penalty(self, input_ids_row, scores_row): + unique_ids, counts = torch.unique(input_ids_row, return_counts=True) + + # Apply frequency penalty + raw_frequency_penalty = counts.to(scores_row.dtype) + frequency_penalty = raw_frequency_penalty * self.frequency_penalty + scores_row.scatter_add_(0, unique_ids, -frequency_penalty) + return scores_row + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + input_ids = input_ids[:, -self._range:] + for input_ids_row, scores_row in zip(input_ids, scores): + scores_row = self.apply_frequency_penalty(input_ids_row, scores_row) + return scores + + +def get_logits_processor_patch(self, **kwargs): + generation_config = kwargs['generation_config'] # Parameter sanitization if isinstance(generation_config.temperature, int): 
generation_config.temperature = float(generation_config.temperature) # Must be float # Get the original warpers - warpers = self._get_logits_warper_old(generation_config, **kwargs) + warpers = self._get_logits_processor_old(**kwargs) - # Replace temperature with our modified class. - # Currently, it behaves identically to the original. - for i in range(len(warpers)): + for i in range(len(warpers) - 1, -1, -1): + # Replace temperature with our modified class. if warpers[i].__class__.__name__ == 'TemperatureLogitsWarper': warpers[i] = TemperatureLogitsWarperCustom( generation_config.temperature, ) + # Stuff we don't need + elif warpers[i].__class__.__name__ in ['RepetitionPenaltyLogitsProcessor']: + del warpers[i] + # Add custom warpers warpers_to_add = LogitsProcessorList() min_tokens_to_keep = 2 if generation_config.num_beams > 1 else 1 + + if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0: + warpers_to_add.append( + RepetitionPenaltyLogitsProcessorWithRange( + penalty=generation_config.repetition_penalty, + _range=generation_config.repetition_penalty_range + ) + ) + + if generation_config.presence_penalty is not None and generation_config.presence_penalty != 0.0: + warpers_to_add.append( + PresencePenaltyLogitsProcessor( + presence_penalty=generation_config.presence_penalty, + _range=generation_config.repetition_penalty_range + ) + ) + + if generation_config.frequency_penalty is not None and generation_config.frequency_penalty != 0.0: + warpers_to_add.append( + FrequencyPenaltyLogitsProcessor( + frequency_penalty=generation_config.frequency_penalty, + _range=generation_config.repetition_penalty_range + ) + ) + + if generation_config.dry_multiplier is not None and generation_config.dry_multiplier > 0.0: + dry_sequence_breakers = generation_config.dry_sequence_breakers + + # Support both JSON array notation and comma-separated strings. + if not dry_sequence_breakers.startswith("["): + dry_sequence_breakers = "[" + dry_sequence_breakers + "]" + + sequence_breaker_strings = json.loads(dry_sequence_breakers) + # Prefix with 'a' to get the correct encoding of the token at the end of a text. 
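+            # For most BPE/SentencePiece tokenizers, encoding a breaker such as "\n" on
+            # its own can produce a different id than the same text gets mid-sequence
+            # (leading-space and BOS handling vary between tokenizers). Encoding "a\n"
+            # and keeping only the last id yields the in-text form; e.g. a hypothetical
+            # tokenizer might map "a\n" to [64, 198], in which case 198 is used as the
+            # breaker id.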
+ sequence_breakers = { + shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings + } + + warpers.append( + DRYLogitsProcessor( + multiplier=generation_config.dry_multiplier, + base=generation_config.dry_base, + allowed_length=generation_config.dry_allowed_length, + sequence_breakers=sequence_breakers, + _range=generation_config.repetition_penalty_range, + ) + ) + if generation_config.tfs is not None and 0.0 <= generation_config.tfs < 1.0: warpers_to_add.append( TailFreeLogitsWarper( @@ -395,6 +525,14 @@ def get_logits_warper_patch(self, generation_config, **kwargs): ) ) + if generation_config.xtc_probability is not None and generation_config.xtc_probability > 0: + warpers_to_add.append( + XTCLogitsWarper( + threshold=generation_config.xtc_threshold, + probability=generation_config.xtc_probability, + ) + ) + if generation_config.dynamic_temperature: warpers_to_add.append( DynamicTemperatureLogitsWarper( @@ -436,11 +574,10 @@ def get_logits_warper_patch(self, generation_config, **kwargs): if generation_config.temperature_last: for param_name in ['temperature', 'dynamic_temperature', 'quadratic_sampling']: if param_name in sampler_priority: - if param_name in sampler_priority: - index = sampler_priority.index(param_name) - sampler_priority.append(sampler_priority.pop(index)) - else: - sampler_priority.append(param_name) + index = sampler_priority.index(param_name) + sampler_priority.append(sampler_priority.pop(index)) + else: + sampler_priority.append(param_name) class_name_to_nickname = { 'DynamicTemperatureLogitsWarper': 'dynamic_temperature', @@ -454,17 +591,23 @@ def get_logits_warper_patch(self, generation_config, **kwargs): 'TopALogitsWarper': 'top_a', 'TopKLogitsWarper': 'top_k', 'TopPLogitsWarper': 'top_p', - 'TypicalLogitsWarper': 'typical_p' + 'TypicalLogitsWarper': 'typical_p', + 'XTCLogitsWarper': 'xtc', + 'RepetitionPenaltyLogitsProcessorWithRange': 'repetition_penalty', + 'PresencePenaltyLogitsProcessor': 'presence_penalty', + 'FrequencyPenaltyLogitsProcessor': 'frequency_penalty', + 'DRYLogitsProcessor': 'dry', + 'EncoderRepetitionPenaltyLogitsProcessor': 'encoder_repetition_penalty', + 'NoRepeatNGramLogitsProcessor': 'no_repeat_ngram', } def custom_sort_key(obj): class_name = obj.__class__.__name__ - # Return a large value if class name is not mapped or if the mapped nickname is not in priority + # Return -1 if class_name is not mapped if class_name not in class_name_to_nickname or class_name_to_nickname[class_name] not in sampler_priority: - return float('inf') + return -1 - # Return the index of the nickname in the priority list for sorting return sampler_priority.index(class_name_to_nickname[class_name]) # Sort the list using the custom key function @@ -482,49 +625,6 @@ def get_logits_warper_patch(self, generation_config, **kwargs): return warpers -def get_logits_processor_patch(self, **kwargs): - generation_config = kwargs['generation_config'] - - do_rep_pen_hijack = (generation_config.repetition_penalty > 1) or (generation_config.presence_penalty != 0) or (generation_config.frequency_penalty != 0) - if do_rep_pen_hijack: - generation_config.repetition_penalty = 1.1 # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created - - result = self._get_logits_processor_old(**kwargs) - - if do_rep_pen_hijack: - for i in range(len(result)): - if result[i].__class__.__name__ == 'RepetitionPenaltyLogitsProcessor': - result[i] = RepetitionPenaltyLogitsProcessorWithRange( - generation_config.repetition_penalty, - generation_config.presence_penalty, - 
generation_config.frequency_penalty, - generation_config.repetition_penalty_range - ) - - if generation_config.dry_multiplier is not None and generation_config.dry_multiplier > 0.0: - dry_sequence_breakers = generation_config.dry_sequence_breakers - - # Support both JSON array notation and comma-separated strings. - if not dry_sequence_breakers.startswith("["): - dry_sequence_breakers = "[" + dry_sequence_breakers + "]" - - sequence_breaker_strings = json.loads(dry_sequence_breakers) - # Prefix with 'a' to get the correct encoding of the token at the end of a text. - sequence_breakers = {shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings} - - result.append( - DRYLogitsProcessor( - multiplier=generation_config.dry_multiplier, - base=generation_config.dry_base, - allowed_length=generation_config.dry_allowed_length, - sequence_breakers=sequence_breakers, - _range=generation_config.repetition_penalty_range, - ) - ) - - return result - - def generation_config_init_patch(self, **kwargs): self.__init___old(**kwargs) self.min_p = kwargs.pop("min_p", 0.0) @@ -546,14 +646,13 @@ def generation_config_init_patch(self, **kwargs): self.dry_base = kwargs.pop("dry_base", 1.75) self.dry_allowed_length = kwargs.pop("dry_allowed_length", 2) self.dry_sequence_breakers = kwargs.pop("dry_sequence_breakers", '"\\n", ":", "\\"", "*"') + self.xtc_threshold = kwargs.pop("xtc_threshold", 0.1) + self.xtc_probability = kwargs.pop("xtc_probability", 0) self.temperature_last = kwargs.pop("temperature_last", False) - self.sampler_priority = kwargs.pop("sampler_priority", ['temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat']) + self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'xtc', 'encoder_repetition_penalty', 'no_repeat_ngram']) def hijack_samplers(): - transformers.GenerationMixin._get_logits_warper_old = transformers.GenerationMixin._get_logits_warper - transformers.GenerationMixin._get_logits_warper = get_logits_warper_patch - transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch diff --git a/modules/sane_markdown_lists.py b/modules/sane_markdown_lists.py new file mode 100644 index 00000000..1e1d76fd --- /dev/null +++ b/modules/sane_markdown_lists.py @@ -0,0 +1,336 @@ +# Code based on the Sane List Extension for Python-Markdown +# ======================================= + +# Modify the behavior of Lists in Python-Markdown to act in a sane manner. + +# See https://Python-Markdown.github.io/extensions/sane_lists +# for documentation. + +# Original code Copyright 2011 [Waylan Limberg](http://achinghead.com) + +# All changes Copyright 2011-2014 The Python Markdown Project + +# License: [BSD](https://opensource.org/licenses/bsd-license.php) + +""" +Modify the behavior of Lists in Python-Markdown to act in a sane manner. 
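+
+In this variant, a nested list starts after MIN_NESTED_LIST_INDENT (two) leading
+spaces, ordered and unordered lists are not treated as siblings of one another,
+and lazy ordered-list numbering is disabled.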
+""" + +from __future__ import annotations + +import re +import xml.etree.ElementTree as etree +from typing import TYPE_CHECKING + +from markdown import Extension +from markdown.blockparser import BlockParser +from markdown.blockprocessors import ( + ListIndentProcessor, + OListProcessor, + ParagraphProcessor +) + +if TYPE_CHECKING: # pragma: no cover + from markdown import blockparser + + +# The min. number of added leading spaces needed to start a nested list +MIN_NESTED_LIST_INDENT = 2 +assert MIN_NESTED_LIST_INDENT > 1, "'MIN_NESTED_LIST_INDENT' must be > 1" + + +class SaneListIndentProcessor(ListIndentProcessor): + """ Process children of list items. + + Example + + * a list item + process this part + + or this part + + """ + + def __init__(self, *args): + super().__init__(*args) + self.INDENT_RE = re.compile(r'^(([ ])+)') + + def test(self, parent: etree.Element, block: str) -> bool: + return block.startswith(' ' * MIN_NESTED_LIST_INDENT) and \ + not self.parser.state.isstate('detabbed') and \ + (parent.tag in self.ITEM_TYPES or + (len(parent) and parent[-1] is not None and + (parent[-1].tag in self.LIST_TYPES))) + + def get_level(self, parent: etree.Element, block: str) -> tuple[int, etree.Element]: + """ Get level of indentation based on list level. """ + # Get indent level + m = self.INDENT_RE.match(block) + if m: + indent_level = len(m.group(1)) / MIN_NESTED_LIST_INDENT + else: + indent_level = 0 + if self.parser.state.isstate('list'): + # We're in a tight-list - so we already are at correct parent. + level = 1 + else: + # We're in a loose-list - so we need to find parent. + level = 0 + # Step through children of tree to find matching indent level. + while indent_level > level: + child = self.lastChild(parent) + if (child is not None and + (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)): + if child.tag in self.LIST_TYPES: + level += 1 + parent = child + else: + # No more child levels. If we're short of `indent_level`, + # we have a code block. So we stop here. + break + return level, parent + + def detab(self, text: str, length: int | None = None) -> tuple[str, str]: + """ Remove a tab from the front of each line of the given text. """ + if length is None: + length = MIN_NESTED_LIST_INDENT + newtext = [] + lines = text.split('\n') + for line in lines: + if line.startswith(' ' * length): + newtext.append(line[length:]) + elif not line.strip(): + newtext.append('') + else: + break + return '\n'.join(newtext), '\n'.join(lines[len(newtext):]) + + def looseDetab(self, text: str, level: int = 1) -> str: + """ Remove indentation from front of lines but allowing dedented lines. """ + lines = text.split('\n') + for i in range(len(lines)): + if lines[i].startswith(' ' * MIN_NESTED_LIST_INDENT * level): + lines[i] = lines[i][MIN_NESTED_LIST_INDENT * level:] + return '\n'.join(lines) + + +class SaneOListProcessor(OListProcessor): + """ Override `SIBLING_TAGS` to not include `ul` and set `LAZY_OL` to `False`. """ + + SIBLING_TAGS = ['ol'] + """ Exclude `ul` from list of siblings. """ + LAZY_OL = False + """ Disable lazy list behavior. """ + + def __init__(self, parser: blockparser.BlockParser): + super().__init__(parser) + # This restriction stems from the 'CodeBlockProcessor' class, + # which automatically matches blocks with an indent = self.tab_length + max_list_start_indent = self.tab_length - 1 + # Detect an item (e.g., `1. item`) + self.RE = re.compile(r'^[ ]{0,%d}[\*_]{0,2}\d+\.[ ]+(.*)' % max_list_start_indent) + # Detect items on secondary lines. 
they can be of either list type. + self.CHILD_RE = re.compile(r'^[ ]{0,%d}([\*_]{0,2})((\d+\.))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1)) + # Detect indented (nested) items of either type + self.INDENT_RE = re.compile(r'^[ ]{%d,%d}[\*_]{0,2}((\d+\.)|[*+-])[ ]+.*' % + (MIN_NESTED_LIST_INDENT, self.tab_length * 2 - 1)) + + def run(self, parent: etree.Element, blocks: list[str]) -> None: + # Check for multiple items in one block. + items = self.get_items(blocks.pop(0)) + sibling = self.lastChild(parent) + + if sibling is not None and sibling.tag in self.SIBLING_TAGS: + # Previous block was a list item, so set that as parent + lst = sibling + # make sure previous item is in a `p` - if the item has text, + # then it isn't in a `p` + if lst[-1].text: + # since it's possible there are other children for this + # sibling, we can't just `SubElement` the `p`, we need to + # insert it as the first item. + p = etree.Element('p') + p.text = lst[-1].text + lst[-1].text = '' + lst[-1].insert(0, p) + # if the last item has a tail, then the tail needs to be put in a `p` + # likely only when a header is not followed by a blank line + lch = self.lastChild(lst[-1]) + if lch is not None and lch.tail: + p = etree.SubElement(lst[-1], 'p') + p.text = lch.tail.lstrip() + lch.tail = '' + + # parse first block differently as it gets wrapped in a `p`. + li = etree.SubElement(lst, 'li') + self.parser.state.set('looselist') + firstitem = items.pop(0) + self.parser.parseBlocks(li, [firstitem]) + self.parser.state.reset() + elif parent.tag in ['ol', 'ul']: + # this catches the edge case of a multi-item indented list whose + # first item is in a blank parent-list item: + # * * subitem1 + # * subitem2 + # see also `ListIndentProcessor` + lst = parent + else: + # This is a new list so create parent with appropriate tag. + lst = etree.SubElement(parent, self.TAG) + # Check if a custom start integer is set + if not self.LAZY_OL and self.STARTSWITH != '1': + lst.attrib['start'] = self.STARTSWITH + + self.parser.state.set('list') + # Loop through items in block, recursively parsing each with the + # appropriate parent. + for item in items: + if item.startswith(" " * MIN_NESTED_LIST_INDENT): + # Item is indented. Parse with last item as parent + self.parser.parseBlocks(lst[-1], [item]) + else: + # New item. Create `li` and parse with it as parent + li = etree.SubElement(lst, 'li') + self.parser.parseBlocks(li, [item]) + self.parser.state.reset() + + def looseDetab(self, text: str, indent_length: int, level: int = 1) -> str: + """ Remove indentation from front of lines but allowing dedented lines. """ + lines = text.split('\n') + for i in range(len(lines)): + if lines[i].startswith(' ' * indent_length * level): + lines[i] = lines[i][indent_length * level:] + return '\n'.join(lines) + + def get_items(self, block: str) -> list[str]: + """ Break a block into list items. """ + # If first level of list is indented, remove that indentation + if (indent_len := len(block) - len(block.lstrip())) > 0: + block = self.looseDetab(block, indent_len) + items = [] + for line in block.split('\n'): + m = self.CHILD_RE.match(line) + if m: + # This is a new list item + # Check first item for the start index + if not items: + # Detect the integer value of first list item + INTEGER_RE = re.compile(r'(\d+)') + self.STARTSWITH = INTEGER_RE.match(m.group(2)).group() + # Append to the list + items.append(m.group(1) + m.group(4)) + elif self.INDENT_RE.match(line): + # This is an indented (possibly nested) item. 
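+                # Example (with MIN_NESTED_LIST_INDENT == 2), for a block like:
+                #   1. parent
+                #     2. child a
+                #     3. child b
+                # the indented lines are collected into a single chunk below; run()
+                # later parses that chunk with the previous <li> as its parent, so it
+                # becomes a nested list rather than a new top-level one.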
+ if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT): + # Previous item was indented. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + else: + items.append(line) + else: + # This is another line of previous item. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + return items + + +class SaneUListProcessor(SaneOListProcessor): + """ Override `SIBLING_TAGS` to not include `ol`. """ + + TAG: str = 'ul' + SIBLING_TAGS = ['ul'] + """ Exclude `ol` from list of siblings. """ + + def __init__(self, parser: blockparser.BlockParser): + super().__init__(parser) + # Detect an item (e.g., `- item` or `+ item` or `* item`). + max_list_start_indent = self.tab_length - 1 + self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % max_list_start_indent) + self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*+-]))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1)) + + def get_items(self, block: str) -> list[str]: + """ Break a block into list items. """ + # If first level of list is indented, remove that indentation + if (indent_len := len(block) - len(block.lstrip())) > 0: + block = self.looseDetab(block, indent_len) + items = [] + for line in block.split('\n'): + m = self.CHILD_RE.match(line) + if m: + # Append to the list + items.append(m.group(3)) + elif self.INDENT_RE.match(line): + # This is an indented (possibly nested) item. + if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT): + # Previous item was indented. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + else: + items.append(line) + else: + # This is another line of previous item. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + return items + + +class SaneParagraphProcessor(ParagraphProcessor): + """ Process Paragraph blocks. """ + + def __init__(self, parser: BlockParser): + super().__init__(parser) + max_list_start_indent = self.tab_length - 1 + self.LIST_RE = re.compile(r"\s{2}\n(\s{0,%d}[\d+*-])" % max_list_start_indent) + + def run(self, parent: etree.Element, blocks: list[str]) -> None: + block = blocks.pop(0) + if block.strip(): + # Not a blank block. Add to parent, otherwise throw it away. + if self.parser.state.isstate('list'): + # The parent is a tight-list. + # + # Check for any children. This will likely only happen in a + # tight-list when a header isn't followed by a blank line. + # For example: + # + # * # Header + # Line 2 of list item - not part of header. + sibling = self.lastChild(parent) + if sibling is not None: + # Insert after sibling. + if sibling.tail: + sibling.tail = '{}\n{}'.format(sibling.tail, block) + else: + sibling.tail = '\n%s' % block + else: + # Append to parent.text + if parent.text: + parent.text = '{}\n{}'.format(parent.text, block) + else: + parent.text = block.lstrip() + else: + # Check if paragraph contains a list + next_list_block = None + if list_match := self.LIST_RE.search(block): + list_start = list_match.end() - len(list_match.group(1)) + next_list_block = block[list_start:] + block = block[:list_start] + + # Create a regular paragraph + p = etree.SubElement(parent, 'p') + p.text = block.lstrip() + + # If a list was found, parse its block separately with the paragraph as the parent + if next_list_block: + self.parser.parseBlocks(p, [next_list_block]) + + +class SaneListExtension(Extension): + """ Add sane lists to Markdown. """ + + def extendMarkdown(self, md): + """ Override existing Processors. 
""" + md.parser.blockprocessors.register(SaneListIndentProcessor(md.parser), 'indent', 90) + md.parser.blockprocessors.register(SaneOListProcessor(md.parser), 'olist', 40) + md.parser.blockprocessors.register(SaneUListProcessor(md.parser), 'ulist', 30) + md.parser.blockprocessors.register(SaneParagraphProcessor(md.parser), 'paragraph', 10) + + +def makeExtension(**kwargs): # pragma: no cover + return SaneListExtension(**kwargs) diff --git a/modules/shared.py b/modules/shared.py index 17e1a3ee..9adaafd7 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -29,39 +29,40 @@ need_restart = False # UI defaults settings = { - 'dark_theme': True, 'show_controls': True, 'start_with': '', 'mode': 'chat-instruct', 'chat_style': 'cai-chat', + 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', 'prompt-default': 'QA', 'prompt-notebook': 'QA', - 'preset': 'min_p', - 'max_new_tokens': 512, - 'max_new_tokens_min': 1, - 'max_new_tokens_max': 4096, - 'negative_prompt': '', - 'seed': -1, - 'truncation_length': 2048, - 'max_tokens_second': 0, - 'max_updates_second': 0, - 'prompt_lookup_num_tokens': 0, - 'custom_stopping_strings': '', - 'custom_token_bans': '', - 'auto_max_new_tokens': False, - 'ban_eos_token': False, - 'add_bos_token': True, - 'skip_special_tokens': True, - 'stream': True, 'character': 'Assistant', 'name1': 'You', 'user_bio': '', 'custom_system_message': '', + 'preset': 'min_p', + 'max_new_tokens': 512, + 'max_new_tokens_min': 1, + 'max_new_tokens_max': 4096, + 'prompt_lookup_num_tokens': 0, + 'max_tokens_second': 0, + 'max_updates_second': 0, + 'auto_max_new_tokens': True, + 'ban_eos_token': False, + 'add_bos_token': True, + 'skip_special_tokens': True, + 'stream': True, + 'static_cache': False, + 'truncation_length': 2048, + 'seed': -1, + 'custom_stopping_strings': '', + 'custom_token_bans': '', + 'negative_prompt': '', + 'autoload_model': False, + 'dark_theme': True, + 'default_extensions': [], 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}", - 'chat-instruct_command': 'Continue the chat dialogue below. 
Write a single reply for the character "<|character|>".\n\n<|prompt|>', - 'autoload_model': False, - 'default_extensions': [], } default_settings = copy.deepcopy(settings) @@ -81,12 +82,11 @@ group.add_argument('--model-menu', action='store_true', help='Show a model menu group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') -group.add_argument('--chat-buttons', action='store_true', help='Show buttons on the chat tab instead of a hover menu.') group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.') # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -104,6 +104,7 @@ group.add_argument('--force-safetensors', action='store_true', help='Set use_saf group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.') group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.') group.add_argument('--use_eager_attention', action='store_true', help='Set attn_implementation= eager while loading the model.') +group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.') # bitsandbytes 4-bit group = parser.add_argument_group('bitsandbytes 4-bit') @@ -115,7 +116,7 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for # llama.cpp group = parser.add_argument_group('llama.cpp') group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.') -group.add_argument('--tensorcores', action='store_true', help='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.') +group.add_argument('--tensorcores', action='store_true', help='NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. 
This may improve performance on newer cards.') group.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.') group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') @@ -143,20 +144,8 @@ group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Creat group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.') group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.') -group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') -group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') - -# AutoGPTQ -group = parser.add_argument_group('AutoGPTQ') -group.add_argument('--triton', action='store_true', help='Use triton.') -group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.') -group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') -group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') -group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') -group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.') -group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 
2, 3, 4 and 8 are supported.') -group.add_argument('--groupsize', type=int, default=-1, help='Group size.') +group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') # HQQ group = parser.add_argument_group('HQQ') @@ -166,6 +155,10 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='B group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') +# Cache +group = parser.add_argument_group('Cache') +group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') + # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -190,6 +183,7 @@ group.add_argument('--gradio-auth-path', type=str, help='Set the Gradio authenti group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certificate key file.', default=None) group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') +group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.') # API group = parser.add_argument_group('API') @@ -199,6 +193,8 @@ group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudf group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.') group.add_argument('--api-key', type=str, default='', help='API authentication key.') group.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.') +group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 for the API') +group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API') group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.') group.add_argument('--model-selection-mode', type=int, default=0, help='Model selection mode: bitwise flag. 
1=Include dummy models, 2=Include local models, 4=Return only the currently loaded model if local models are included.') @@ -208,11 +204,17 @@ group.add_argument('--multimodal-pipeline', type=str, default=None, help='The mu # Deprecated parameters group = parser.add_argument_group('Deprecated') -group.add_argument('--model_type', type=str, help='DEPRECATED') -group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED') -group.add_argument('--checkpoint', type=str, help='DEPRECATED') -group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED') -group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED') +group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED') +group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED') +group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED') +group.add_argument('--triton', action='store_true', help='DEPRECATED') +group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED') +group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED') +group.add_argument('--desc_act', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED') +group.add_argument('--wbits', type=int, default=0, help='DEPRECATED') +group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -222,14 +224,26 @@ for arg in sys.argv[1:]: if hasattr(args, arg): provided_arguments.append(arg) -deprecated_args = [] +deprecated_args = [ + 'cache_4bit', + 'cache_8bit', + 'chat_buttons', + 'triton', + 'no_inject_fused_mlp', + 'no_use_cuda_fp16', + 'desc_act', + 'disable_exllama', + 'disable_exllamav2', + 'wbits', + 'groupsize' +] def do_cmd_flags_warnings(): # Deprecation warnings for k in deprecated_args: - if getattr(args, k): + if k in provided_arguments: logger.warning(f'The --{k} flag has been deprecated and will be removed soon. 
Please remove that flag.') # Security warnings @@ -255,10 +269,6 @@ def fix_loader_name(name): return 'llamacpp_HF' elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']: return 'Transformers' - elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']: - return 'AutoGPTQ' - elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: - return 'ExLlama' elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: return 'ExLlamav2' elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: @@ -269,6 +279,58 @@ def fix_loader_name(name): return 'TensorRT-LLM' +def transform_legacy_kv_cache_options(opts): + # Handle both argparse.Namespace and dict here + def get(key): + return opts.get(key) if isinstance(opts, dict) else getattr(opts, key, None) + + def set(key, value): + if isinstance(opts, dict): + opts[key] = value + else: + setattr(opts, key, value) + + def del_key(key, fallback_set): + # only remove from user dict, can't delete from argparse.Namespace + if type(opts) is dict: + if key in opts: + del opts[key] + else: + setattr(opts, key, fallback_set) + + # Retrieve values + loader = get('loader') + cache_8bit = get('cache_8bit') + cache_4bit = get('cache_4bit') + + # Determine cache type based on loader or legacy flags + if cache_8bit or cache_4bit: + if not loader: + # Legacy behavior: prefer 8-bit over 4-bit to minimize breakage + if cache_8bit: + set('cache_type', 'fp8') + elif cache_4bit: + set('cache_type', 'q4') + elif loader.lower() in ['exllamav2', 'exllamav2_hf']: + # ExLlamaV2 loader-specific cache type + if cache_8bit: + set('cache_type', 'fp8') + elif cache_4bit: + set('cache_type', 'q4') + elif loader.lower() in ['llama.cpp', 'llamacpp_hf']: + # Llama.cpp loader-specific cache type + if cache_4bit: + set('cache_type', 'q4_0') + elif cache_8bit: + set('cache_type', 'q8_0') + + # Clean up legacy keys + del_key('cache_4bit', False) + del_key('cache_8bit', False) + + return opts + + def add_extension(name, last=False): if args.extensions is None: args.extensions = [name] @@ -297,10 +359,14 @@ def load_user_config(): else: user_config = {} + for model_name in user_config: + user_config[model_name] = transform_legacy_kv_cache_options(user_config[model_name]) + return user_config args.loader = fix_loader_name(args.loader) +args = transform_legacy_kv_cache_options(args) # Activate the multimodal extension if args.multimodal_pipeline is not None: diff --git a/modules/text_generation.py b/modules/text_generation.py index 75e5ef36..152b2b8d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -16,7 +16,7 @@ from transformers import ( ) import modules.shared as shared -from modules import models +from modules import models, sampler_hijack from modules.cache_utils import process_llamacpp_cache from modules.callbacks import ( Iteratorize, @@ -28,7 +28,9 @@ from modules.grammar.grammar_utils import initialize_grammar from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor from modules.html_generator import generate_basic_html from modules.logging_colors import logger -from modules.models import clear_torch_cache, load_model +from modules.models import clear_torch_cache, get_device, load_model + +sampler_hijack.hijack_samplers() def generate_reply(*args, **kwargs): @@ -79,7 +81,6 @@ def _generate_reply(question, state, stopping_strings=None, 
is_chat=False, escap all_stop_strings += st shared.stop_everything = False - clear_torch_cache() seed = set_manual_seed(state['seed']) last_update = -1 reply = '' @@ -160,18 +161,12 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids - elif shared.args.deepspeed: - import deepspeed - return input_ids.to(deepspeed.get_accelerator().current_device_name()) - elif torch.backends.mps.is_available(): - device = torch.device('mps') - return input_ids.to(device) - elif is_torch_xpu_available(): - return input_ids.to("xpu:0") - elif is_torch_npu_available(): - return input_ids.to("npu:0") else: - return input_ids.cuda() + device = get_device() + if device: + return input_ids.to(device) + + return input_ids def decode(output_ids, skip_special_tokens=True): @@ -274,7 +269,12 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0): if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from) and not reply.startswith(' '): first_token = shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from])) if isinstance(first_token, (bytes,)): - first_token = first_token.decode('utf8') + # try to decode the bytes to a string + # if it fails, which means it's not a string in this turn, just ignore it + try: + first_token = first_token.decode('utf8') + except UnicodeDecodeError: + first_token = '' if first_token.startswith('▁'): reply = ' ' + reply @@ -283,29 +283,66 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0): def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): + if shared.args.loader == 'Transformers': + clear_torch_cache() + generate_params = {} - for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers']: + for k in [ + 'temperature', + 'dynatemp_low', + 'dynatemp_high', + 'dynatemp_exponent', + 'smoothing_factor', + 'smoothing_curve', + 'min_p', + 'top_p', + 'top_k', + 'typical_p', + 'xtc_threshold', + 'xtc_probability', + 'tfs', + 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', + 'repetition_penalty', + 'frequency_penalty', + 'presence_penalty', + 'encoder_repetition_penalty', + 'no_repeat_ngram_size', + 'repetition_penalty_range', + 'penalty_alpha', + 'guidance_scale', + 'mirostat_mode', + 'mirostat_tau', + 'mirostat_eta', + 'max_new_tokens', + 'do_sample', + 'dynamic_temperature', + 'temperature_last', + 'dry_sequence_breakers', + ]: if k in state: generate_params[k] = state[k] - if isinstance(state['sampler_priority'], list) and len(state['sampler_priority']) > 0: - generate_params['sampler_priority'] = state['sampler_priority'] - elif isinstance(state['sampler_priority'], str) and state['sampler_priority'].strip() != '': - generate_params['sampler_priority'] = [x.strip() for x in state['sampler_priority'].replace('\n', ',').split(',') if x.strip()] - - if state['negative_prompt'] != '': - 
generate_params['negative_prompt_ids'] = encode(state['negative_prompt']) - - if state['prompt_lookup_num_tokens'] > 0: - generate_params['prompt_lookup_num_tokens'] = state['prompt_lookup_num_tokens'] - for k in ['epsilon_cutoff', 'eta_cutoff']: if state[k] > 0: generate_params[k] = state[k] * 1e-4 + if state['prompt_lookup_num_tokens'] > 0: + generate_params['prompt_lookup_num_tokens'] = state['prompt_lookup_num_tokens'] + if state['ban_eos_token']: generate_params['suppress_tokens'] = [shared.tokenizer.eos_token_id] + if state['static_cache']: + generate_params['cache_implementation'] = 'static' + + if isinstance(state['sampler_priority'], list) and len(state['sampler_priority']) > 0: + generate_params['sampler_priority'] = state['sampler_priority'] + elif isinstance(state['sampler_priority'], str) and state['sampler_priority'].strip() != '': + generate_params['sampler_priority'] = [x.strip() for x in state['sampler_priority'].replace('\n', ',').split(',') if x.strip()] + if state['custom_token_bans']: to_ban = [int(x) for x in state['custom_token_bans'].split(',')] if len(to_ban) > 0: @@ -314,6 +351,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings else: generate_params['suppress_tokens'] = to_ban + if state['negative_prompt'] != '': + generate_params['negative_prompt_ids'] = encode(state['negative_prompt']) + generate_params.update({'use_cache': not shared.args.no_cache}) if shared.args.deepspeed: generate_params.update({'synced_gpus': True}) @@ -321,7 +361,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings # Encode the input input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) output = input_ids[0] - cuda = not any((shared.args.cpu, shared.args.deepspeed)) if state['auto_max_new_tokens']: generate_params['max_new_tokens'] = state['truncation_length'] - input_ids.shape[-1] @@ -376,8 +415,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings if not state['stream']: with torch.no_grad(): output = shared.model.generate(**generate_params)[0] - if cuda: - output = output.cuda() + device = get_device() + if device: + output = output.to(device) starting_from = 0 if shared.is_seq2seq else len(input_ids[0]) yield get_reply_from_output_ids(output, state, starting_from=starting_from) @@ -388,7 +428,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings def generate_with_callback(callback=None, *args, **kwargs): kwargs['stopping_criteria'].append(Stream(callback_func=callback)) - clear_torch_cache() with torch.no_grad(): shared.model.generate(**kwargs) diff --git a/modules/training.py b/modules/training.py index b003fc8c..11c4b8c5 100644 --- a/modules/training.py +++ b/modules/training.py @@ -18,14 +18,6 @@ import gradio as gr import torch import transformers from datasets import Dataset, load_dataset -from peft import ( - LoraConfig, - get_peft_model, - prepare_model_for_kbit_training, - set_peft_model_state_dict -) -from peft.utils.other import \ - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as model_to_lora_modules from transformers import is_torch_xpu_available from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES @@ -292,6 +284,16 @@ def calc_trainable_parameters(model): def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: 
int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): + from peft import ( + LoraConfig, + get_peft_model, + prepare_model_for_kbit_training, + set_peft_model_state_dict + ) + from peft.utils.other import \ + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \ + model_to_lora_modules + global WANT_INTERRUPT WANT_INTERRUPT = False diff --git a/modules/ui.py b/modules/ui.py index 47f92cf0..df948a14 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -19,6 +19,8 @@ with open(Path(__file__).resolve().parent / '../css/highlightjs/highlightjs-copy css += f.read() with open(Path(__file__).resolve().parent / '../js/main.js', 'r') as f: js = f.read() +with open(Path(__file__).resolve().parent / '../js/global_scope_js.js', 'r') as f: + global_scope_js = f.read() with open(Path(__file__).resolve().parent / '../js/save_files.js', 'r') as f: save_files_js = f.read() with open(Path(__file__).resolve().parent / '../js/switch_tabs.js', 'r') as f: @@ -50,6 +52,50 @@ theme = gr.themes.Default( button_secondary_border_color="var(--border-color-primary)" ) +if not shared.args.old_colors: + theme = theme.set( + # General Colors + border_color_primary='#c5c5d2', + body_text_color_subdued='#484848', + background_fill_secondary='#eaeaea', + background_fill_secondary_dark='var(--selected-item-color-dark)', + background_fill_primary='var(--neutral-50)', + background_fill_primary_dark='var(--darker-gray)', + body_background_fill="white", + block_background_fill="transparent", + body_text_color="#333", + button_secondary_background_fill="#f4f4f4", + button_secondary_border_color="var(--border-color-primary)", + + # Dark Mode Colors + input_background_fill_dark='var(--darker-gray)', + checkbox_background_color_dark='var(--darker-gray)', + block_background_fill_dark='transparent', + block_border_color_dark='transparent', + input_border_color_dark='var(--border-color-dark)', + checkbox_border_color_dark='var(--border-color-dark)', + border_color_primary_dark='var(--border-color-dark)', + button_secondary_border_color_dark='var(--border-color-dark)', + body_background_fill_dark='var(--dark-gray)', + button_primary_background_fill_dark='transparent', + button_secondary_background_fill_dark='transparent', + checkbox_label_background_fill_dark='transparent', + button_cancel_background_fill_dark='transparent', + button_secondary_background_fill_hover_dark='var(--selected-item-color-dark)', + checkbox_label_background_fill_hover_dark='var(--selected-item-color-dark)', + table_even_background_fill_dark='var(--darker-gray)', + table_odd_background_fill_dark='var(--selected-item-color-dark)', + code_background_fill_dark='var(--darker-gray)', + + # Shadows and Radius + checkbox_label_shadow='none', + block_shadow='none', + block_shadow_dark='none', + button_large_radius='0.375rem', + button_large_padding='6px 12px', + input_radius='0.375rem', + ) + if Path("notification.mp3").exists(): audio_notification_js = "document.querySelector('#audio_notification audio')?.play();" else: @@ -58,62 +104,55 @@ else: def list_model_elements(): elements = [ - 'loader', 'filter_by_loader', + 'loader', 'cpu_memory', - 
'auto_devices', - 'disk', - 'cpu', - 'bf16', - 'load_in_8bit', - 'trust_remote_code', - 'no_use_fast', - 'use_flash_attention_2', - 'use_eager_attention', - 'load_in_4bit', - 'compute_dtype', - 'quant_type', - 'use_double_quant', - 'wbits', - 'groupsize', - 'triton', - 'desc_act', - 'no_inject_fused_mlp', - 'no_use_cuda_fp16', - 'disable_exllama', - 'disable_exllamav2', - 'cfg_cache', - 'no_flash_attn', - 'no_xformers', - 'no_sdpa', - 'num_experts_per_token', - 'cache_8bit', - 'cache_4bit', - 'autosplit', + 'n_gpu_layers', 'threads', 'threads_batch', 'n_batch', - 'no_mmap', - 'mlock', - 'no_mul_mat_q', - 'n_gpu_layers', - 'tensor_split', + 'hqq_backend', 'n_ctx', - 'gpu_split', 'max_seq_len', - 'compress_pos_emb', + 'cache_type', + 'tensor_split', + 'gpu_split', 'alpha_value', 'rope_freq_base', - 'numa', - 'logits_all', - 'no_offload_kqv', - 'row_split', - 'tensorcores', - 'flash_attn', - 'streaming_llm', + 'compress_pos_emb', + 'compute_dtype', + 'quant_type', 'attention_sink_size', - 'hqq_backend', + 'num_experts_per_token', + 'tensorcores', + 'load_in_8bit', + 'load_in_4bit', + 'torch_compile', + 'flash_attn', + 'use_flash_attention_2', + 'streaming_llm', + 'auto_devices', + 'cpu', + 'disk', + 'row_split', + 'no_offload_kqv', + 'no_mul_mat_q', + 'no_mmap', + 'mlock', + 'numa', + 'use_double_quant', + 'use_eager_attention', + 'bf16', + 'autosplit', + 'enable_tp', + 'no_flash_attn', + 'no_xformers', + 'no_sdpa', + 'cfg_cache', 'cpp_runner', + 'logits_all', + 'trust_remote_code', + 'no_use_fast', ] if is_torch_xpu_available(): @@ -128,83 +167,87 @@ def list_model_elements(): def list_interface_input_elements(): elements = [ - 'max_new_tokens', - 'auto_max_new_tokens', - 'max_tokens_second', - 'max_updates_second', - 'prompt_lookup_num_tokens', - 'seed', 'temperature', - 'temperature_last', - 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', - 'top_p', 'min_p', + 'top_p', 'top_k', 'typical_p', + 'xtc_threshold', + 'xtc_probability', 'epsilon_cutoff', 'eta_cutoff', + 'tfs', + 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', 'repetition_penalty', - 'presence_penalty', 'frequency_penalty', - 'repetition_penalty_range', + 'presence_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', - 'dry_multiplier', - 'dry_base', - 'dry_allowed_length', - 'dry_sequence_breakers', - 'do_sample', + 'repetition_penalty_range', 'penalty_alpha', + 'guidance_scale', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', - 'grammar_string', - 'negative_prompt', - 'guidance_scale', - 'add_bos_token', + 'max_new_tokens', + 'prompt_lookup_num_tokens', + 'max_tokens_second', + 'max_updates_second', + 'do_sample', + 'dynamic_temperature', + 'temperature_last', + 'auto_max_new_tokens', 'ban_eos_token', - 'custom_token_bans', - 'sampler_priority', - 'truncation_length', - 'custom_stopping_strings', + 'add_bos_token', 'skip_special_tokens', 'stream', - 'tfs', - 'top_a', + 'static_cache', + 'truncation_length', + 'seed', + 'sampler_priority', + 'custom_stopping_strings', + 'custom_token_bans', + 'negative_prompt', + 'dry_sequence_breakers', + 'grammar_string', ] # Chat elements elements += [ + 'history', + 'search_chat', + 'unique_id', 'textbox', 'start_with', + 'mode', + 'chat_style', + 'chat-instruct_command', 'character_menu', - 'history', - 'unique_id', + 'name2', + 'context', + 'greeting', 'name1', 'user_bio', - 'name2', - 'greeting', - 'context', - 'mode', 'custom_system_message', 'instruction_template_str', 'chat_template_str', 
- 'chat_style', - 'chat-instruct_command', ] # Notebook/default elements elements += [ - 'textbox-notebook', 'textbox-default', - 'output_textbox', + 'textbox-notebook', 'prompt_menu-default', 'prompt_menu-notebook', + 'output_textbox', ] # Model elements @@ -229,10 +272,10 @@ def gather_interface_values(*args): def apply_interface_values(state, use_persistent=False): if use_persistent: state = shared.persistent_interface_state - if 'textbox-default' in state: + if 'textbox-default' in state and 'prompt_menu-default' in state: state.pop('prompt_menu-default') - if 'textbox-notebook' in state: + if 'textbox-notebook' and 'prompt_menu-notebook' in state: state.pop('prompt_menu-notebook') elements = list_interface_input_elements() diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 57143cd8..395300d0 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -12,20 +12,42 @@ from modules.utils import gradio inputs = ('Chat input', 'interface_state') reload_arr = ('history', 'name1', 'name2', 'mode', 'chat_style', 'character_menu') -clear_arr = ('delete_chat-confirm', 'delete_chat', 'delete_chat-cancel') def create_ui(): mu = shared.args.multi_user shared.gradio['Chat input'] = gr.State() - shared.gradio['history'] = gr.JSON({'internal': [], 'visible': []}, visible=False) + shared.gradio['history'] = gr.JSON(visible=False) + + with gr.Tab('Chat', id='Chat', elem_id='chat-tab'): + with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): + with gr.Column(): + with gr.Row(elem_id='past-chats-buttons'): + shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu) + shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) + shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) + shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) + + shared.gradio['search_chat'] = gr.Textbox(placeholder='Search chats...', max_lines=1, elem_id='search_chat') + + with gr.Row(elem_id='delete-chat-row', visible=False) as shared.gradio['delete-chat-row']: + shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input']) + shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', elem_classes=['refresh-button', 'focus-on-chat-input']) + + with gr.Row(elem_id='rename-row', visible=False) as shared.gradio['rename-row']: + shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', elem_classes=['no-background']) + with gr.Row(): + shared.gradio['rename_to-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input']) + shared.gradio['rename_to-confirm'] = gr.Button('Confirm', elem_classes=['refresh-button', 'focus-on-chat-input'], variant='primary') + + with gr.Row(): + shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats') - with gr.Tab('Chat', elem_id='chat-tab', elem_classes=("old-ui" if shared.args.chat_buttons else None)): with gr.Row(): with gr.Column(elem_id='chat-col'): - shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')) - + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', ''), visible=True) + shared.gradio['display'] = gr.Textbox(value="", visible=False) 
# Hidden buffer with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
          ', elem_id='gr-hover') @@ -60,25 +82,6 @@ def create_ui(): shared.gradio['send-chat-to-default'] = gr.Button('Send to default') shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook') - with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): - with gr.Column(): - with gr.Row(): - shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) - shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) - shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) - - with gr.Row(elem_id='rename-row'): - shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background']) - with gr.Row(): - shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - - gr.Markdown("Past chats") - with gr.Row(): - shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats') - with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(): @@ -161,7 +164,7 @@ def create_chat_settings_ui(): with gr.Row(): with gr.Column(): shared.gradio['custom_system_message'] = gr.Textbox(value=shared.settings['custom_system_message'], lines=2, label='Custom system message', info='If not empty, will be used instead of the default one.', elem_classes=['add_scrollbar']) - shared.gradio['instruction_template_str'] = gr.Textbox(value='', label='Instruction template', lines=24, info='Change this according to the model/LoRA that you are using. Used in instruct and chat-instruct modes.', elem_classes=['add_scrollbar', 'monospace']) + shared.gradio['instruction_template_str'] = gr.Textbox(value='', label='Instruction template', lines=24, info='This gets autodetected; you usually don\'t need to change it. 
Used in instruct and chat-instruct modes.', elem_classes=['add_scrollbar', 'monospace']) with gr.Row(): shared.gradio['send_instruction_to_default'] = gr.Button('Send to default', elem_classes=['small-button']) shared.gradio['send_instruction_to_notebook'] = gr.Button('Send to notebook', elem_classes=['small-button']) @@ -177,32 +180,45 @@ def create_event_handlers(): shared.input_params = gradio(inputs) shared.reload_inputs = gradio(reload_arr) + # Morph HTML updates instead of updating everything + shared.gradio['display'].change(None, gradio('display'), None, js="(text) => handleMorphdomUpdate(text)") + shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Regenerate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Continue'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Impersonate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.impersonate_wrapper, gradio(inputs), gradio('textbox', 
'display'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Replace last reply'].click( @@ -234,21 +250,29 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_start_new_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) - shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr)) - shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) + shared.gradio['delete_chat'].click(lambda: gr.update(visible=True), None, gradio('delete-chat-row')) + shared.gradio['delete_chat-cancel'].click(lambda: gr.update(visible=False), None, gradio('delete-chat-row')) shared.gradio['delete_chat-confirm'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id') + gradio(clear_arr), show_progress=False) + chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'delete-chat-row'), show_progress=False) - shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) - shared.gradio['rename_to-cancel'].click(lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + shared.gradio['branch_chat'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + + shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) + shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) shared.gradio['rename_to-confirm'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename-row')) shared.gradio['rename_to'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename-row'), show_progress=False) + + shared.gradio['search_chat'].change( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_search_chat_change, gradio('interface_state'), gradio('unique_id'), show_progress=False) shared.gradio['load_chat_history'].upload( ui.gather_interface_values, gradio(shared.input_elements), 
gradio('interface_state')).then( diff --git a/modules/ui_default.py b/modules/ui_default.py index 112acd23..ccae9a5e 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -20,12 +20,12 @@ def create_ui(): with gr.Column(): with gr.Row(): shared.gradio['textbox-default'] = gr.Textbox(value='', lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar']) - shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_classes=["token-counter", "default-token-counter"]) + shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_id="default-token-counter") with gr.Row(): - shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop') shared.gradio['Continue-default'] = gr.Button('Continue') + shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop', visible=False) + shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary') with gr.Row(): shared.gradio['prompt_menu-default'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') @@ -63,20 +63,26 @@ def create_ui(): def create_event_handlers(): shared.gradio['Generate-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-default', 'Generate-default')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-default', 'Generate-default')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-default'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-default', 'Generate-default')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-default', 'Generate-default')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Continue-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-default', 'Generate-default')).then( generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then( lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-default', 'Generate-default')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index ac72c623..3a27e1b9 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py 
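The chat and Default-tab handlers above follow one busy-state pattern: a `js=` hook toggles a `_generating` CSS class around the long-running generation call, and the Stop/Generate buttons swap visibility via `gr.update`. A minimal standalone Gradio 4.x sketch of that same pattern, with placeholder names (`slow_reply`, `demo`) that are not part of this patch:

```python
# Standalone sketch of the busy-state pattern used in the event handlers above.
# slow_reply stands in for the real generation wrapper; it is not project code.
import time

import gradio as gr


def slow_reply(prompt):
    time.sleep(2)  # placeholder for the actual generation call
    return f"Echo: {prompt}"


with gr.Blocks(css="._generating { opacity: 0.6; }") as demo:
    prompt = gr.Textbox(elem_id="chat")
    output = gr.Textbox()
    generate = gr.Button("Generate", variant="primary")
    stop = gr.Button("Stop", visible=False)

    generate.click(
        # show Stop, hide Generate, then mark the chat area as busy
        lambda: [gr.update(visible=True), gr.update(visible=False)], None, [stop, generate]).then(
        None, None, None, js='() => document.getElementById("chat").classList.add("_generating")').then(
        slow_reply, prompt, output, show_progress=False).then(
        None, None, None, js='() => document.getElementById("chat").classList.remove("_generating")').then(
        lambda: [gr.update(visible=False), gr.update(visible=True)], None, [stop, generate])

demo.launch()
```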
@@ -74,7 +74,7 @@ def handle_save_preset_confirm_click(filename, contents): try: utils.save_file(f"presets/{filename}.yaml", contents) available_presets = utils.get_available_presets() - output = gr.update(choices=available_presets, value=filename), + output = gr.update(choices=available_presets, value=filename) except Exception: output = gr.update() traceback.print_exc() diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 1883fdca..d5116938 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -80,77 +80,60 @@ def create_ui(): with gr.Blocks(): with gr.Row(): with gr.Column(): - with gr.Blocks(): - for i in range(len(total_mem)): - shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) + for i in range(len(total_mem)): + shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) - shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) - - with gr.Blocks(): - shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) - - shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be set to more than 0 for your GPU to be used.') - shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.') - shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') - shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) + shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) + shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") - shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") + shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) + shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) + shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. 
⚠️ Lower this value if you can\'t load the model.') + shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. Try lowering this if you run out of memory while loading the model.') - with gr.Blocks(): - shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') - shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') - shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.') - - shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') + shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') + shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') + shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') + shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. 
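The `rope_freq_base` help text above states the conversion from `alpha_value`; a short illustrative computation of that relation (the alpha values below are just the examples quoted in the help text):

```python
# Convert an NTK RoPE alpha_value into the equivalent rope_freq_base,
# following the relation quoted in the UI help text above.
def alpha_to_rope_freq_base(alpha_value: float) -> float:
    return 10000 * alpha_value ** (64 / 63)


for alpha in (1.0, 1.75, 2.5):  # 1.75 ≈ 1.5x context, 2.5 ≈ 2x context per the help text
    print(f"alpha_value={alpha:<4} -> rope_freq_base ≈ {alpha_to_rope_freq_base(alpha):.0f}")
```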
Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.') + shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') with gr.Column(): + shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. This may improve performance on newer cards.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) - shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) - shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') - shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') + shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') - shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) - shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.') - shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') - shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.') + shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') - shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.') + shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') + shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. 
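The new `torch-compile` checkbox above maps onto PyTorch 2.x's `torch.compile`. Roughly, and outside this codebase, the call looks like the sketch below; the model id and prompt are placeholders, not something the patch references:

```python
# Generic illustration of what a "compile the model" toggle boils down to in
# PyTorch 2.x: wrap the loaded module with torch.compile before running it.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model = torch.compile(model)  # the first forward pass triggers compilation

inputs = tokenizer("Hello", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)
```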
This saves VRAM but reduces the performance.') shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') - shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') - shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') - shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') - shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) + shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') + shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') + shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') - shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') - with gr.Blocks(): - shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) - shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') - shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. 
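The `compute_dtype`, `quant_type`, and `use_double_quant` options marked "Used by load-in-4bit." above correspond to the usual Transformers-side quantization config. A sketch of that mapping (the model id is a placeholder, and the webui's own loader wiring may differ):

```python
# How the load-in-4bit knobs above typically map onto the Transformers API.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # "compute_dtype"
    bnb_4bit_quant_type="nf4",              # "quant_type"
    bnb_4bit_use_double_quant=True,         # "use_double_quant"
)

model = AutoModelForCausalLM.from_pretrained(
    "some/model-id",                        # placeholder id
    quantization_config=quant_config,
    device_map="auto",
)
```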
Otherwise, ignore it, as it makes prompt processing slower.') - - shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') - shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') - shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") + shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.') + shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) + shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") + shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). 
\n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') with gr.Column(): @@ -194,13 +177,13 @@ def create_event_handlers(): shared.gradio['model_menu'].change( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success( + load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['load_model'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( update_model_parameters, gradio('interface_state'), None).then( - partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success( + partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) @@ -259,6 +242,8 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield ("Please enter a model path") return + repo_id = repo_id.strip() + specific_file = specific_file.strip() downloader = importlib.import_module("download-model").ModelDownloader() progress(0.0) @@ -296,7 +281,7 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur downloader.check_model_files(model, branch, links, sha256, output_folder) progress(1.0) else: - yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}`") + yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`") downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp) yield (f"Model successfully saved to `{output_folder}/`.") @@ -316,7 +301,7 @@ def create_llamacpp_hf(gguf_name, unquantized_url, progress=gr.Progress()): links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=True) output_folder = Path(shared.args.model_dir) / (re.sub(r'(?i)\.gguf$', '', gguf_name) + "-HF") - yield (f"Downloading tokenizer to `{output_folder}`") + yield (f"Downloading tokenizer to `{output_folder}/`") downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=False) # Move the GGUF diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index 79932844..b234ac57 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -23,7 +23,7 @@ def create_ui(): with gr.Tab('Raw'): with gr.Row(): shared.gradio['textbox-notebook'] = gr.Textbox(value='', lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar']) - shared.gradio['token-counter-notebook'] = gr.HTML(value="0", elem_classes=["token-counter"]) + shared.gradio['token-counter-notebook'] = 
gr.HTML(value="0", elem_id="notebook-token-counter") with gr.Tab('Markdown'): shared.gradio['markdown_render-notebook'] = gr.Button('Render') @@ -48,10 +48,10 @@ def create_ui(): shared.gradio['tokens-notebook'] = gr.Textbox(lines=23, label='Tokens', elem_classes=['textbox_logits_notebook', 'add_scrollbar', 'monospace']) with gr.Row(): - shared.gradio['Generate-notebook'] = gr.Button('Generate', variant='primary', elem_classes='small-button') - shared.gradio['Stop-notebook'] = gr.Button('Stop', elem_classes='small-button', elem_id='stop') shared.gradio['Undo'] = gr.Button('Undo', elem_classes='small-button') shared.gradio['Regenerate-notebook'] = gr.Button('Regenerate', elem_classes='small-button') + shared.gradio['Stop-notebook'] = gr.Button('Stop', visible=False, elem_classes='small-button', elem_id='stop') + shared.gradio['Generate-notebook'] = gr.Button('Generate', variant='primary', elem_classes='small-button') with gr.Column(scale=1): gr.HTML('
          ') @@ -66,22 +66,28 @@ def create_event_handlers(): shared.gradio['Generate-notebook'].click( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-notebook', 'Generate-notebook')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-notebook', 'Generate-notebook')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-notebook'].submit( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-notebook', 'Generate-notebook')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-notebook', 'Generate-notebook')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Regenerate-notebook'].click( lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-notebook', 'Generate-notebook')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-notebook', 'Generate-notebook')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Undo'].click( diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index eff62c20..c8fd6bc7 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -27,75 +27,80 @@ def create_ui(default_preset): with gr.Column(): with gr.Row(): with gr.Column(): - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + gr.Markdown('## Curve shape') shared.gradio['temperature'] = gr.Slider(0.01, 5, value=generate_params['temperature'], step=0.01, label='temperature') - shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') - shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') - shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') - shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p') - shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') - shared.gradio['frequency_penalty'] = gr.Slider(0, 2, 
value=generate_params['frequency_penalty'], step=0.05, label='frequency_penalty') - shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty') - shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') - shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') - - with gr.Blocks(): - shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=generate_params['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to greater than 0 to enable DRY. Recommended value: 0.8.') - shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=generate_params['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.') - shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.') - shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.') - - gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab)") - - with gr.Column(): - with gr.Group(): - shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') - shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') - shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') - shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"') - shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Token bans', info='Token IDs to ban, separated by commas. The IDs can be found in the Default or Notebook tab.') - - shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. do_sample must be unchecked.') - shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 
1.5 is a good value.') - shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', lines=3, elem_classes=['add_scrollbar']) - shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') - shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') - shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') - shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') - shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') - shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') - shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') - - with gr.Column(): - with gr.Row() as shared.gradio['grammar_file_row']: - shared.gradio['grammar_file'] = gr.Dropdown(value='None', choices=utils.get_available_grammars(), label='Load grammar from file (.gbnf)', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['grammar_file'], lambda: None, lambda: {'choices': utils.get_available_grammars()}, 'refresh-button', interactive=not mu) - shared.gradio['save_grammar'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu) - shared.gradio['delete_grammar'] = gr.Button('🗑️ ', elem_classes='refresh-button', interactive=not mu) - - shared.gradio['grammar_string'] = gr.Textbox(value='', label='Grammar', lines=16, elem_classes=['add_scrollbar', 'monospace']) - - with gr.Row(): - with gr.Column(): - shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') - shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') - shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Activates Quadratic Sampling.') - shared.gradio['smoothing_curve'] = gr.Slider(1.0, 10.0, value=generate_params['smoothing_curve'], step=0.01, label='smoothing_curve', info='Adjusts the dropoff curve of Quadratic Sampling.') - shared.gradio['dynamic_temperature'] = gr.Checkbox(value=generate_params['dynamic_temperature'], label='dynamic_temperature') shared.gradio['dynatemp_low'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_low'], step=0.01, label='dynatemp_low', visible=generate_params['dynamic_temperature']) shared.gradio['dynatemp_high'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_high'], step=0.01, label='dynatemp_high', visible=generate_params['dynamic_temperature']) shared.gradio['dynatemp_exponent'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_exponent'], step=0.01, label='dynatemp_exponent', visible=generate_params['dynamic_temperature']) - shared.gradio['temperature_last'] = gr.Checkbox(value=generate_params['temperature_last'], label='temperature_last', info='Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in "Sampler priority".') - shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', 
info='Parameter names separated by new lines or commas.') + shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Activates Quadratic Sampling.') + shared.gradio['smoothing_curve'] = gr.Slider(1.0, 10.0, value=generate_params['smoothing_curve'], step=0.01, label='smoothing_curve', info='Adjusts the dropoff curve of Quadratic Sampling.') + + gr.Markdown('## Curve cutoff') + shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p') + shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') + shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') + shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') + shared.gradio['xtc_threshold'] = gr.Slider(0, 0.5, value=generate_params['xtc_threshold'], step=0.01, label='xtc_threshold', info='If 2 or more tokens have probability above this threshold, consider removing all but the last one.') + shared.gradio['xtc_probability'] = gr.Slider(0, 1, value=generate_params['xtc_probability'], step=0.01, label='xtc_probability', info='Probability that the removal will actually happen. 0 disables the sampler. 1 makes it always happen.') + shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') + shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') + shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') + shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') + + gr.Markdown('## Repetition suppression') + shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=generate_params['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to greater than 0 to enable DRY. 
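The two XTC sliders above are described only by their help strings; the sketch below implements that described behaviour literally and is not the project's actual sampler code:

```python
# Illustrative sketch of the XTC ("exclude top choices") behaviour as described
# by the xtc_threshold / xtc_probability help strings above.
import numpy as np


def xtc_filter(probs: np.ndarray, threshold: float, probability: float,
               rng: np.random.Generator) -> np.ndarray:
    """Zero out every token above `threshold` except the least likely of them."""
    if probability <= 0 or rng.random() >= probability:
        return probs  # sampler disabled, or skipped on this draw

    above = np.flatnonzero(probs > threshold)
    if above.size < 2:
        return probs  # need at least two candidates above the threshold

    keep = above[np.argmin(probs[above])]  # the least probable one above the threshold survives
    filtered = probs.copy()
    filtered[above] = 0.0
    filtered[keep] = probs[keep]
    return filtered / filtered.sum()       # renormalise


rng = np.random.default_rng(0)
print(xtc_filter(np.array([0.5, 0.3, 0.15, 0.05]), threshold=0.1, probability=1.0, rng=rng))
```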
Recommended value: 0.8.') + shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=generate_params['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.') + shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.') + shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') + shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=generate_params['frequency_penalty'], step=0.05, label='frequency_penalty') + shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty') + shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') + shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') + shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') with gr.Column(): - shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') + gr.Markdown('## Alternative sampling methods') + shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. do_sample must be unchecked.') + shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 
1.5 is a good value.') + shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') + shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') + shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') + + gr.Markdown('## Other options') + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.') shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.') - shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') + + with gr.Column(): + with gr.Row(): + with gr.Column(): + shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + shared.gradio['dynamic_temperature'] = gr.Checkbox(value=generate_params['dynamic_temperature'], label='dynamic_temperature') + shared.gradio['temperature_last'] = gr.Checkbox(value=generate_params['temperature_last'], label='temperature_last', info='Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in "Sampler priority".') + shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') + shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') + shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') + shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') + + with gr.Column(): + shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. 
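The truncation warning on `max_new_tokens` above comes down to simple arithmetic, since the prompt and the reply share one context window. A rough sketch under the assumption that the reply budget is reserved up front:

```python
# Rough arithmetic behind the "setting this too high can cause prompt truncation"
# warning: tokens reserved for the reply are no longer available to the prompt.
def max_prompt_tokens(truncation_length: int, max_new_tokens: int) -> int:
    """Prompt tokens that survive before the leftmost ones get trimmed."""
    return max(truncation_length - max_new_tokens, 0)


print(max_prompt_tokens(truncation_length=4096, max_new_tokens=512))   # most of the window stays for the prompt
print(max_prompt_tokens(truncation_length=4096, max_new_tokens=4096))  # 0: the prompt would be fully truncated
```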
Most models require this to be at most 2048.') + shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') + + shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.', elem_classes=['add_scrollbar']) + shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"') + shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Token bans', info='Token IDs to ban, separated by commas. The IDs can be found in the Default or Notebook tab.') + shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', info='For CFG. Only used when guidance_scale is different than 1.', lines=3, elem_classes=['add_scrollbar']) + shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.') + with gr.Row() as shared.gradio['grammar_file_row']: + shared.gradio['grammar_file'] = gr.Dropdown(value='None', choices=utils.get_available_grammars(), label='Load grammar from file (.gbnf)', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['grammar_file'], lambda: None, lambda: {'choices': utils.get_available_grammars()}, 'refresh-button', interactive=not mu) + shared.gradio['save_grammar'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu) + shared.gradio['delete_grammar'] = gr.Button('🗑️ ', elem_classes='refresh-button', interactive=not mu) + + shared.gradio['grammar_string'] = gr.Textbox(value='', label='Grammar', lines=16, elem_classes=['add_scrollbar', 'monospace']) ui_chat.create_chat_settings_ui() diff --git a/one_click.py b/one_click.py index 0a0412ba..e78a2450 100644 --- a/one_click.py +++ b/one_click.py @@ -16,9 +16,9 @@ import sys # Define the required PyTorch version -TORCH_VERSION = "2.2.2" -TORCHVISION_VERSION = "0.17.2" -TORCHAUDIO_VERSION = "2.2.2" +TORCH_VERSION = "2.4.1" +TORCHVISION_VERSION = "0.19.1" +TORCHAUDIO_VERSION = "2.4.1" # Environment script_dir = os.getcwd() @@ -117,7 +117,7 @@ def update_pytorch(): elif is_cuda: install_pytorch += "--index-url https://download.pytorch.org/whl/cu121" elif is_rocm: - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm5.6" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" elif is_cpu: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif is_intel: @@ -189,8 +189,11 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, conda_sh_path = os.path.join(script_dir, "installer_files", "conda", "etc", "profile.d", "conda.sh") cmd = f'. 
"{conda_sh_path}" && conda activate "{conda_env_path}" && {cmd}' + # Set executable to None for Windows, bash for everything else + executable = None if is_windows() else 'bash' + # Run shell commands - result = subprocess.run(cmd, shell=True, capture_output=capture_output, env=env) + result = subprocess.run(cmd, shell=True, capture_output=capture_output, env=env, executable=executable) # Assert the command ran successfully if assert_success and result.returncode != 0: @@ -229,33 +232,45 @@ def get_user_choice(question, options_dict): def install_webui(): - # Ask the user for the GPU vendor if "GPU_CHOICE" in os.environ: choice = os.environ["GPU_CHOICE"].upper() print_big_message(f"Selected GPU choice \"{choice}\" based on the GPU_CHOICE environment variable.") + + # Warn about changed meanings and handle old NVIDIA choice + if choice == "B": + print_big_message("Warning: GPU_CHOICE='B' now means 'NVIDIA (CUDA 11.8)' in the new version.") + elif choice == "C": + print_big_message("Warning: GPU_CHOICE='C' now means 'AMD' in the new version.") + elif choice == "D": + print_big_message("Warning: GPU_CHOICE='D' now means 'Apple M Series' in the new version.") + elif choice == "A" and "USE_CUDA118" in os.environ: + choice = "B" if os.environ.get("USE_CUDA118", "").lower() in ("yes", "y", "true", "1", "t", "on") else "A" else: choice = get_user_choice( "What is your GPU?", { - 'A': 'NVIDIA', - 'B': 'AMD (Linux/MacOS only. Requires ROCm SDK 5.6 on Linux)', - 'C': 'Apple M Series', - 'D': 'Intel Arc (IPEX)', - 'N': 'None (I want to run models in CPU mode)' + 'A': 'NVIDIA - CUDA 12.1 (recommended)', + 'B': 'NVIDIA - CUDA 11.8 (legacy GPUs)', + 'C': 'AMD - Linux/macOS only, requires ROCm 6.1', + 'D': 'Apple M Series', + 'E': 'Intel Arc (beta)', + 'N': 'CPU mode' }, ) + # Convert choices to GPU names for compatibility gpu_choice_to_name = { "A": "NVIDIA", - "B": "AMD", - "C": "APPLE", - "D": "INTEL", + "B": "NVIDIA", + "C": "AMD", + "D": "APPLE", + "E": "INTEL", "N": "NONE" } selected_gpu = gpu_choice_to_name[choice] - use_cuda118 = "N" + use_cuda118 = (choice == "B") # CUDA version is now determined by menu choice # Write a flag to CMD_FLAGS.txt for CPU mode if selected_gpu == "NONE": @@ -264,18 +279,9 @@ def install_webui(): print_big_message("Adding the --cpu flag to CMD_FLAGS.txt.") cmd_flags_file.write("\n--cpu\n") - # Check if the user wants CUDA 11.8 + # Handle CUDA version display elif any((is_windows(), is_linux())) and selected_gpu == "NVIDIA": - if "USE_CUDA118" in os.environ: - use_cuda118 = "Y" if os.environ.get("USE_CUDA118", "").lower() in ("yes", "y", "true", "1", "t", "on") else "N" - else: - print("\nDo you want to use CUDA 11.8 instead of 12.1?\nOnly choose this option if your GPU is very old (Kepler or older).\n\nFor RTX and GTX series GPUs, say \"N\".\nIf unsure, say \"N\".\n") - use_cuda118 = input("Input (Y/N)> ").upper().strip('"\'').strip() - while use_cuda118 not in 'YN': - print("Invalid choice. 
Please try again.") - use_cuda118 = input("Input> ").upper().strip('"\'').strip() - - if use_cuda118 == 'Y': + if use_cuda118: print("CUDA: 11.8") else: print("CUDA: 12.1") @@ -294,7 +300,7 @@ def install_webui(): else: install_pytorch += "--index-url https://download.pytorch.org/whl/cu121" elif selected_gpu == "AMD": - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm5.6" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" elif selected_gpu in ["APPLE", "NONE"]: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif selected_gpu == "INTEL": @@ -310,7 +316,7 @@ def install_webui(): if selected_gpu == "INTEL": # Install oneAPI dependencies via conda print_big_message("Installing Intel oneAPI runtime libraries.") - run_cmd("conda install -y -c intel dpcpp-cpp-rt=2024.0 mkl-dpcpp=2024.0") + run_cmd("conda install -y -c https://software.repos.intel.com/python/conda/ -c conda-forge dpcpp-cpp-rt=2024.0 mkl-dpcpp=2024.0") # Install libuv required by Intel-patched torch run_cmd("conda install -y libuv") @@ -326,7 +332,7 @@ def install_extensions_requirements(): print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.") extensions = get_extensions_names() for i, extension in enumerate(extensions): - print(f"\n\n--- [{i+1}/{len(extensions)}]: {extension}\n\n") + print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n") extension_req_path = os.path.join("extensions", extension, "requirements.txt") run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True) @@ -391,7 +397,7 @@ def update_requirements(initial_installation=False, pull=True): textgen_requirements = [ req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements - if "auto-gptq" not in req.lower() and "autoawq" not in req.lower() + if "autoawq" not in req.lower() ] if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11 diff --git a/presets/Big O.yaml b/presets/Big O.yaml deleted file mode 100644 index 2ab18268..00000000 --- a/presets/Big O.yaml +++ /dev/null @@ -1,6 +0,0 @@ -temperature: 0.87 -top_p: 0.99 -typical_p: 0.68 -tfs: 0.68 -repetition_penalty: 1.01 -top_k: 85 diff --git a/presets/Creative.yaml b/presets/Creative.yaml new file mode 100644 index 00000000..3ed04190 --- /dev/null +++ b/presets/Creative.yaml @@ -0,0 +1,2 @@ +min_p: 0.02 +xtc_probability: 0.5 diff --git a/presets/Debug-deterministic.yaml b/presets/Deterministic.yaml similarity index 100% rename from presets/Debug-deterministic.yaml rename to presets/Deterministic.yaml diff --git a/presets/Divine Intellect.yaml b/presets/Divine Intellect.yaml deleted file mode 100644 index ac750e40..00000000 --- a/presets/Divine Intellect.yaml +++ /dev/null @@ -1,4 +0,0 @@ -temperature: 1.31 -top_p: 0.14 -repetition_penalty: 1.17 -top_k: 49 diff --git a/presets/Instruct.yaml b/presets/Instruct.yaml new file mode 100644 index 00000000..142fcd82 --- /dev/null +++ b/presets/Instruct.yaml @@ -0,0 +1 @@ +min_p: 0.2 diff --git a/presets/LLaMA-Precise.yaml b/presets/LLaMA-Precise.yaml deleted file mode 100644 index c5f9cae2..00000000 --- a/presets/LLaMA-Precise.yaml +++ /dev/null @@ -1,4 +0,0 @@ -temperature: 0.7 -top_p: 0.1 -repetition_penalty: 1.18 -top_k: 40 diff --git a/presets/Midnight Enigma.yaml b/presets/Midnight Enigma.yaml deleted file mode 100644 index 0bd1763c..00000000 --- a/presets/Midnight 
Enigma.yaml +++ /dev/null @@ -1,4 +0,0 @@ -temperature: 0.98 -top_p: 0.37 -repetition_penalty: 1.18 -top_k: 100 diff --git a/presets/Shortwave.yaml b/presets/Shortwave.yaml deleted file mode 100644 index a2528abd..00000000 --- a/presets/Shortwave.yaml +++ /dev/null @@ -1,4 +0,0 @@ -temperature: 1.53 -top_p: 0.64 -repetition_penalty: 1.07 -top_k: 33 diff --git a/presets/Yara.yaml b/presets/Yara.yaml deleted file mode 100644 index 87bb019e..00000000 --- a/presets/Yara.yaml +++ /dev/null @@ -1,4 +0,0 @@ -temperature: 0.82 -top_p: 0.21 -repetition_penalty: 1.19 -top_k: 72 diff --git a/presets/simple-1.yaml b/presets/simple-1.yaml deleted file mode 100644 index 30a10659..00000000 --- a/presets/simple-1.yaml +++ /dev/null @@ -1,4 +0,0 @@ -temperature: 0.7 -top_p: 0.9 -repetition_penalty: 1.15 -top_k: 20 diff --git a/requirements.txt b/requirements.txt index 08b7d56d..18872431 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,30 +1,27 @@ -accelerate==0.33.* -aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" -auto-gptq==0.7.1 -bitsandbytes==0.43.* +accelerate==1.2.* +bitsandbytes==0.45.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb @@ -35,38 +32,30 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -# llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and 
python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +# llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ) +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -# llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +# llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ) +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == 
"x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd.txt b/requirements_amd.txt index 52e36510..87ee93d1 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,27 +1,26 @@ -accelerate==0.33.* +accelerate==1.2.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb @@ -32,18 +31,14 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.89+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.89+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.6+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.6+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 18a81d04..fa2f5ca7 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -1,27 +1,26 @@ -accelerate==0.33.* +accelerate==1.2.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb @@ -32,16 +31,12 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == 
"Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index af02904b..e9838295 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,27 +1,26 @@ -accelerate==0.33.* +accelerate==1.2.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb @@ -32,8 +31,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 8cdd8519..bef02feb 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -1,27 +1,26 @@ -accelerate==0.33.* +accelerate==1.2.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb @@ -32,10 +31,10 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == 
"3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 807c182a..32f1a50a 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -1,27 +1,26 @@ -accelerate==0.33.* +accelerate==1.2.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb @@ -32,7 +31,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index e2a89936..938848bf 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -1,27 +1,26 @@ -accelerate==0.33.* +accelerate==1.2.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb @@ -32,7 +31,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index d22eb72c..69e497e0 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,30 +1,27 @@ -accelerate==0.33.* -aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" -auto-gptq==0.7.1 -bitsandbytes==0.43.* +accelerate==1.2.* +bitsandbytes==0.45.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* 
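Most of the version pins updated here use the `==X.Y.*` wildcard form, which locks the major/minor series while still accepting patch releases; the `transformers` pin moving from `4.44.*` to `4.48.*` at this point is one example. A minimal sketch of how such a specifier behaves, using the `packaging` library (assumed to be importable, as pip vendors it):

```python
# Check which concrete versions a wildcard pin like "transformers==4.48.*" accepts.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet("==4.48.*")
for candidate in ["4.48.0", "4.48.3", "4.49.0"]:
    print(candidate, "->", Version(candidate) in spec)
# Expected: 4.48.0 -> True, 4.48.3 -> True, 4.49.0 -> False
```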
+transformers==4.48.* tqdm wandb @@ -35,38 +32,30 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -# llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +# llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ) +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -# llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +# llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ) +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
-https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index ffb45fe3..a034ee61 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -1,27 +1,26 @@ -accelerate==0.33.* +accelerate==1.2.* colorama datasets einops -gradio==4.26.* -hqq==0.1.7.post3 -jinja2==3.1.4 -lm_eval==0.3.0 +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.5 markdown numba==0.59.* numpy==1.26.* -optimum==1.17.* pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.48.* tqdm wandb diff --git a/server.py b/server.py index d6069d5e..31e1c4c6 100644 --- a/server.py +++ b/server.py @@ -154,6 +154,9 @@ def create_interface(): if ({str(shared.settings['dark_theme']).lower()}) {{ document.getElementsByTagName('body')[0].classList.add('dark'); }} + else {{ + document.getElementsByTagName('body')[0].classList.remove('dark'); + }} {js} {ui.show_controls_js} toggle_controls(x); diff --git a/settings-template.yaml b/settings-template.yaml index 59c76c35..93a64abb 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -1,30 +1,38 @@ -dark_theme: true show_controls: true start_with: '' mode: chat-instruct chat_style: cai-chat +chat-instruct_command: |- + Continue the chat dialogue below. Write a single reply for the character "<|character|>". + + <|prompt|> prompt-default: QA prompt-notebook: QA +character: Assistant +name1: You +user_bio: '' +custom_system_message: '' preset: min_p max_new_tokens: 512 max_new_tokens_min: 1 max_new_tokens_max: 4096 -negative_prompt: '' -seed: -1 -truncation_length: 2048 +prompt_lookup_num_tokens: 0 max_tokens_second: 0 max_updates_second: 0 -prompt_lookup_num_tokens: 0 -custom_stopping_strings: '' -custom_token_bans: '' -auto_max_new_tokens: false +auto_max_new_tokens: true ban_eos_token: false add_bos_token: true skip_special_tokens: true stream: true -character: Assistant -name1: You -custom_system_message: '' +static_cache: false +truncation_length: 2048 +seed: -1 +custom_stopping_strings: '' +custom_token_bans: '' +negative_prompt: '' +autoload_model: false +dark_theme: true +default_extensions: [] instruction_template_str: |- {%- set ns = namespace(found=false) -%} {%- for message in messages -%} @@ -66,11 +74,4 @@ chat_template_str: |- {%- endif -%} {%- endif -%} {%- endfor -%} -chat-instruct_command: |- - Continue the chat dialogue below. Write a single reply for the character "<|character|>". 
- <|prompt|> -autoload_model: false -gallery-items_per_page: 50 -gallery-open: false -default_extensions: [] diff --git a/start_linux.sh b/start_linux.sh index 792daca8..256604cb 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -19,7 +19,7 @@ esac INSTALL_DIR="$(pwd)/installer_files" CONDA_ROOT_PREFIX="$(pwd)/installer_files/conda" INSTALL_ENV_DIR="$(pwd)/installer_files/env" -MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-${OS_ARCH}.sh" +MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py311_24.11.1-0-Linux-${OS_ARCH}.sh" conda_exists="F" # figure out whether git and conda needs to be installed diff --git a/start_macos.sh b/start_macos.sh index 6761f531..02f1011a 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -19,7 +19,7 @@ esac INSTALL_DIR="$(pwd)/installer_files" CONDA_ROOT_PREFIX="$(pwd)/installer_files/conda" INSTALL_ENV_DIR="$(pwd)/installer_files/env" -MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-MacOSX-${OS_ARCH}.sh" +MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py311_24.11.1-0-MacOSX-${OS_ARCH}.sh" conda_exists="F" # figure out whether git and conda needs to be installed diff --git a/start_windows.bat b/start_windows.bat index ebcc1997..960cfdb7 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -25,8 +25,8 @@ set TEMP=%cd%\installer_files set INSTALL_DIR=%cd%\installer_files set CONDA_ROOT_PREFIX=%cd%\installer_files\conda set INSTALL_ENV_DIR=%cd%\installer_files\env -set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Windows-x86_64.exe -set MINICONDA_CHECKSUM=307194e1f12bbeb52b083634e89cc67db4f7980bd542254b43d3309eaf7cb358 +set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py311_24.11.1-0-Windows-x86_64.exe +set MINICONDA_CHECKSUM=43dcbcc315ff91edf959e002cd2f1ede38c64b999fefcc951bccf2ed69c9e8bb set conda_exists=F @rem figure out whether git and conda needs to be installed @@ -41,10 +41,18 @@ if "%conda_exists%" == "F" ( mkdir "%INSTALL_DIR%" call curl -Lk "%MINICONDA_DOWNLOAD_URL%" > "%INSTALL_DIR%\miniconda_installer.exe" || ( echo. && echo Miniconda failed to download. && goto end ) + :: Try CertUtil first for /f %%a in ('CertUtil -hashfile "%INSTALL_DIR%\miniconda_installer.exe" SHA256 ^| find /i /v " " ^| find /i "%MINICONDA_CHECKSUM%"') do ( set "output=%%a" ) + :: If CertUtil fails, try PowerShell + if not defined output ( + for /f %%a in ('powershell -Command "if((Get-FileHash \"%INSTALL_DIR%\miniconda_installer.exe\" -Algorithm SHA256).Hash -eq ''%MINICONDA_CHECKSUM%''){echo true}"') do ( + set "output=%%a" + ) + ) + if not defined output ( echo The checksum verification for miniconda_installer.exe has failed. del "%INSTALL_DIR%\miniconda_installer.exe" diff --git a/update_wizard_linux.sh b/update_wizard_linux.sh index 3ada9a1e..c81d9d9b 100755 --- a/update_wizard_linux.sh +++ b/update_wizard_linux.sh @@ -23,4 +23,4 @@ source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains a conda activate "$INSTALL_ENV_DIR" # update installer env -python one_click.py --update-wizard && echo -e "\nDone!" +python one_click.py --update-wizard && echo -e "\nHave a great day!" 
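`start_windows.bat` now verifies the Miniconda installer's SHA-256 with CertUtil and falls back to PowerShell's `Get-FileHash` when CertUtil yields nothing. The same check expressed in Python, as a rough equivalent rather than what the installer actually runs (the expected digest below is the one pinned in the batch script; the installer path is illustrative):

```python
# Verify a downloaded installer against a pinned SHA-256, mirroring the intent of the
# CertUtil / Get-FileHash fallback added to start_windows.bat.
import hashlib
from pathlib import Path

EXPECTED_SHA256 = "43dcbcc315ff91edf959e002cd2f1ede38c64b999fefcc951bccf2ed69c9e8bb"  # from the batch script

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

installer = Path("installer_files/miniconda_installer.exe")  # illustrative location
if installer.exists():
    ok = sha256_of(installer) == EXPECTED_SHA256
    print("checksum ok" if ok else "checksum MISMATCH - delete the installer and retry")
```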
diff --git a/update_wizard_macos.sh b/update_wizard_macos.sh index c5add61e..f58bb9e9 100755 --- a/update_wizard_macos.sh +++ b/update_wizard_macos.sh @@ -23,4 +23,4 @@ source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains a conda activate "$INSTALL_ENV_DIR" # update installer env -python one_click.py --update-wizard && echo -e "\nDone!" +python one_click.py --update-wizard && echo -e "\nHave a great day!" diff --git a/update_wizard_windows.bat b/update_wizard_windows.bat index 2b23f322..fac251a7 100755 --- a/update_wizard_windows.bat +++ b/update_wizard_windows.bat @@ -30,7 +30,7 @@ call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( @rem update installer env call python one_click.py --update-wizard && ( echo. - echo Done! + echo Have a great day! ) :end diff --git a/wsl.sh b/wsl.sh index 7b17132f..c5d28b16 100755 --- a/wsl.sh +++ b/wsl.sh @@ -26,7 +26,7 @@ fi INSTALL_DIR="$INSTALL_DIR_PREFIX/text-generation-webui" CONDA_ROOT_PREFIX="$INSTALL_DIR/installer_files/conda" INSTALL_ENV_DIR="$INSTALL_DIR/installer_files/env" -MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh" +MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py311_24.11.1-0-Linux-x86_64.sh" conda_exists="F" # environment isolation
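For reference on the reorganized `settings-template.yaml` above: the `chat-instruct_command` entry uses a YAML block scalar (`|-`), so the multi-line command, including the blank line before `<|prompt|>`, is stored as a single string. A small sketch of how that value parses, using PyYAML (already pinned in these requirements via `pyyaml`); the snippet string is reproduced from the template, not loaded from disk:

```python
# Show how the chat-instruct_command block scalar from settings-template.yaml parses.
import yaml

snippet = """
chat-instruct_command: |-
  Continue the chat dialogue below. Write a single reply for the character "<|character|>".

  <|prompt|>
"""

settings = yaml.safe_load(snippet)
print(repr(settings["chat-instruct_command"]))
# 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>'
```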