diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..51e26b13 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,3 @@ +## Checklist: + +- [ ] I have read the [Contributing guidelines](https://github.com/oobabooga/text-generation-webui/wiki/Contributing-guidelines). diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index ce603a4f..2de6d955 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,8 +13,8 @@ jobs: - uses: actions/stale@v5 with: stale-issue-message: "" - close-issue-message: "This issue has been closed due to inactivity for 30 days. If you believe it is still relevant, please leave a comment below." - days-before-issue-stale: 30 + close-issue-message: "This issue has been closed due to inactivity for 6 weeks. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment." + days-before-issue-stale: 42 days-before-issue-close: 0 stale-issue-label: "stale" days-before-pr-stale: -1 diff --git a/README.md b/README.md index 073a841d..77d67fa9 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,27 @@ # Text generation web UI -A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, OPT, and GALACTICA. +A Gradio web UI for Large Language Models. Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. -|![Image1](https://github.com/oobabooga/screenshots/raw/main/qa.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/cai3.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_instruct.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_chat.png) | |:---:|:---:| -|![Image3](https://github.com/oobabooga/screenshots/raw/main/gpt4chan.png) | ![Image4](https://github.com/oobabooga/screenshots/raw/main/galactica.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_default.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_parameters.png) | ## Features -* 3 interface modes: default, notebook, and chat -* Multiple model backends: transformers, llama.cpp, ExLlama, AutoGPTQ, GPTQ-for-LLaMa +* 3 interface modes: default (two columns), notebook, and chat +* Multiple model backends: [transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), [ExLlama](https://github.com/turboderp/exllama), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [ctransformers](https://github.com/marella/ctransformers) * Dropdown menu for quickly switching between different models -* LoRA: load and unload LoRAs on the fly, train a new LoRA -* Precise instruction templates for chat mode, including Llama 2, Alpaca, Vicuna, WizardLM, StableLM, and many others +* LoRA: load and unload LoRAs on the fly, train a new LoRA using QLoRA +* Precise instruction templates for chat mode, including Llama-2-chat, Alpaca, Vicuna, WizardLM, StableLM, and many others +* 4-bit, 8-bit, and CPU inference through the transformers library +* Use llama.cpp models with transformers samplers (`llamacpp_HF` loader) * [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) -* 8-bit and 4-bit inference through bitsandbytes -* CPU mode for transformers models -* [DeepSpeed ZeRO-3 
inference](docs/DeepSpeed.md) -* [Extensions](docs/Extensions.md) +* [Extensions framework](docs/Extensions.md) * [Custom chat characters](docs/Chat-mode.md) * Very efficient text streaming * Markdown output with LaTeX rendering, to use for instance with [GALACTICA](https://github.com/paperswithcode/galai) -* Nice HTML output for GPT-4chan * API, including endpoints for websocket streaming ([see the examples](https://github.com/oobabooga/text-generation-webui/blob/main/api-examples)) To learn how to use the various features, check out the Documentation: https://github.com/oobabooga/text-generation-webui/tree/main/docs @@ -38,26 +36,24 @@ To learn how to use the various features, check out the Documentation: https://g Just download the zip above, extract it, and double-click on "start". The web UI and all its dependencies will be installed in the same folder. -* The source codes are here: https://github.com/oobabooga/one-click-installers +* The source codes and more information can be found here: https://github.com/oobabooga/one-click-installers * There is no need to run the installers as admin. -* AMD doesn't work on Windows. * Huge thanks to [@jllllll](https://github.com/jllllll), [@ClayShoaf](https://github.com/ClayShoaf), and [@xNul](https://github.com/xNul) for their contributions to these installers. ### Manual installation using Conda -Recommended if you have some experience with the command line. +Recommended if you have some experience with the command-line. #### 0. Install Conda https://docs.conda.io/en/latest/miniconda.html -On Linux or WSL, it can be automatically installed with these two commands: +On Linux or WSL, it can be automatically installed with these two commands ([source](https://educe-ubc.github.io/conda.html)): ``` curl -sL "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" > "Miniconda3.sh" bash Miniconda3.sh ``` -Source: https://educe-ubc.github.io/conda.html #### 1. Create a new conda environment @@ -79,7 +75,7 @@ conda activate textgen The up-to-date commands can be found here: https://pytorch.org/get-started/locally/. -#### 2.1 Special instructions +#### 2.1 Additional information * MacOS users: https://github.com/oobabooga/text-generation-webui/pull/393 * AMD users: https://rentry.org/eq3hg @@ -92,9 +88,21 @@ cd text-generation-webui pip install -r requirements.txt ``` -#### bitsandbytes +#### llama.cpp on AMD, Metal, and some specific CPUs -bitsandbytes >= 0.39 may not work on older NVIDIA GPUs. In that case, to use `--load-in-8bit`, you may have to downgrade like this: +Precompiled wheels are included for CPU-only and NVIDIA GPUs (cuBLAS). For AMD, Metal, and some specific CPUs, you need to uninstall those wheels and compile llama-cpp-python yourself. + +To uninstall: + +``` +pip uninstall -y llama-cpp-python llama-cpp-python-cuda +``` + +To compile: https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal + +#### bitsandbytes on older NVIDIA GPUs + +bitsandbytes >= 0.39 may not work. In that case, to use `--load-in-8bit`, you may have to downgrade like this: * Linux: `pip install bitsandbytes==0.38.1` * Windows: `pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl` @@ -113,37 +121,50 @@ docker compose up --build ### Updating the requirements -From time to time, the `requirements.txt` changes. To update, use this command: +From time to time, the `requirements.txt` changes. 
To update, use these commands: ``` conda activate textgen cd text-generation-webui pip install -r requirements.txt --upgrade ``` + ## Downloading models -Models should be placed inside the `models/` folder. +Models should be placed in the `text-generation-webui/models` folder. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads). -[Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads) is the main place to download models. These are some examples: +* Transformers or GPTQ models are made of several files and must be placed in a subfolder. Example: -* [Pythia](https://huggingface.co/models?sort=downloads&search=eleutherai%2Fpythia+deduped) -* [OPT](https://huggingface.co/models?search=facebook/opt) -* [GALACTICA](https://huggingface.co/models?search=facebook/galactica) -* [GPT-J 6B](https://huggingface.co/EleutherAI/gpt-j-6B/tree/main) +``` +text-generation-webui +├── models +│   ├── lmsys_vicuna-33b-v1.3 +│   │   ├── config.json +│   │   ├── generation_config.json +│   │   ├── pytorch_model-00001-of-00007.bin +│   │   ├── pytorch_model-00002-of-00007.bin +│   │   ├── pytorch_model-00003-of-00007.bin +│   │   ├── pytorch_model-00004-of-00007.bin +│   │   ├── pytorch_model-00005-of-00007.bin +│   │   ├── pytorch_model-00006-of-00007.bin +│   │   ├── pytorch_model-00007-of-00007.bin +│   │   ├── pytorch_model.bin.index.json +│   │   ├── special_tokens_map.json +│   │   ├── tokenizer_config.json +│   │   └── tokenizer.model +``` -You can automatically download a model from HF using the script `download-model.py`: +In the "Model" tab of the UI, those models can be automatically downloaded from Hugging Face. You can also download them via the command-line with `python download-model.py organization/model`. - python download-model.py organization/model +* GGML/GGUF models are a single file and should be placed directly into `models`. Example: -For example: +``` +text-generation-webui +├── models +│   ├── llama-13b.ggmlv3.q4_K_M.bin +``` - python download-model.py facebook/opt-1.3b - -To download a protected model, set env vars `HF_USER` and `HF_PASS` to your Hugging Face username and password (or [User Access Token](https://huggingface.co/settings/tokens)). The model's terms must first be accepted on the HF website. - -#### GGML models - -You can drop these directly into the `models/` folder, making sure that the file name contains `ggml` somewhere and ends in `.bin`. +Those models must be downloaded manually, as they are not currently supported by the automated downloader. #### GPT-4chan @@ -169,7 +190,10 @@ After downloading the model, follow these steps: python download-model.py EleutherAI/gpt-j-6B --text-only ``` -When you load this model in default or notebook modes, the "HTML" tab will show the generated text in 4chan format. +When you load this model in default or notebook modes, the "HTML" tab will show the generated text in 4chan format: + +![Image3](https://github.com/oobabooga/screenshots/raw/main/gpt4chan.png) + ## Starting the web UI @@ -189,8 +213,6 @@ Optionally, you can use the following command-line flags: | Flag | Description | |--------------------------------------------|-------------| | `-h`, `--help` | Show this help message and exit. | -| `--notebook` | Launch the web UI in notebook mode, where the output is written to the same text box as the input. | -| `--chat` | Launch the web UI in chat mode. | | `--multi-user` | Multi-user mode. 
Chat histories are not saved or automatically loaded. WARNING: this is highly experimental. | | `--character CHARACTER` | The name of the character to load in chat mode by default. | | `--model MODEL` | Name of the model to load by default. | @@ -198,7 +220,6 @@ Optionally, you can use the following command-line flags: | `--model-dir MODEL_DIR` | Path to directory with all the models. | | `--lora-dir LORA_DIR` | Path to directory with all the loras. | | `--model-menu` | Show a model menu in the terminal when the web UI is first launched. | -| `--no-stream` | Don't stream the text output in real time. | | `--settings SETTINGS_FILE` | Load the default interface settings from this yaml file. See `settings-template.yaml` for an example. If you create a file called `settings.yaml`, this file will be loaded by default without the need to use the `--settings` flag. | | `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. | | `--verbose` | Print the prompts to the terminal. | @@ -207,7 +228,7 @@ Optionally, you can use the following command-line flags: | Flag | Description | |--------------------------------------------|-------------| -| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv | +| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, ctransformers | #### Accelerate/transformers @@ -237,20 +258,35 @@ Optionally, you can use the following command-line flags: | `--quant_type QUANT_TYPE` | quant_type for 4-bit. Valid options: nf4, fp4. | | `--use_double_quant` | use_double_quant for 4-bit. | -#### llama.cpp +#### GGML/GGUF (for llama.cpp and ctransformers) | Flag | Description | |-------------|-------------| | `--threads` | Number of threads to use. | | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. | -| `--no-mmap` | Prevent mmap from being used. | -| `--mlock` | Force the system to keep the model in RAM. | -| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. | | `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. | | `--n_ctx N_CTX` | Size of the prompt context. | -| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). | -| `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama2 70b. | -| `--rms_norm_eps RMS_NORM_EPS` | Must be 1e-5 for llama2 70b. | + +#### llama.cpp + +| Flag | Description | +|---------------|---------------| +| `--no-mmap` | Prevent mmap from being used. | +| `--mlock` | Force the system to keep the model in RAM. | +| `--mul_mat_q` | Activate new mulmat kernels. | +| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. | +| `--tensor_split TENSOR_SPLIT` | Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17 | +| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). | +| `--n_gqa N_GQA` | GGML only (not used by GGUF): Grouped-Query Attention. Must be 8 for llama-2 70b. 
| +| `--rms_norm_eps RMS_NORM_EPS` | GGML only (not used by GGUF): 5e-6 is a good value for llama-2 models. | +| `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. | +|`--cfg-cache` | llamacpp_HF: Create an additional cache for CFG negative prompts. | + +#### ctransformers + +| Flag | Description | +|-------------|-------------| +| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gptneox, falcon, llama, mpt, starcoder (gptbigcode), dollyv2, and replit are supported. | #### AutoGPTQ @@ -261,6 +297,7 @@ Optionally, you can use the following command-line flags: | `--no_inject_fused_mlp` | Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference. | | `--no_use_cuda_fp16` | This can make models faster on some systems. | | `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. | +| `--disable_exllama` | Disable ExLlama kernel, which can improve inference speed on some systems. | #### ExLlama @@ -268,6 +305,7 @@ Optionally, you can use the following command-line flags: |------------------|-------------| |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` | |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. | +|`--cfg-cache` | ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama. | #### GPTQ-for-LLaMa @@ -279,9 +317,6 @@ Optionally, you can use the following command-line flags: | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. | | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. | | `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. -| `--quant_attn` | (triton) Enable quant attention. | -| `--warmup_autotune` | (triton) Enable warmup autotune. | -| `--fused_mlp` | (triton) Enable fused mlp. | #### DeepSpeed @@ -298,12 +333,13 @@ Optionally, you can use the following command-line flags: | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | -#### RoPE (for llama.cpp and ExLlama only) +#### RoPE (for llama.cpp, ExLlama, and transformers) | Flag | Description | |------------------|-------------| -|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | -|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. | +| `--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. | +| `--rope_freq_base ROPE_FREQ_BASE` | If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). | +| `--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should be set to (context length) / (model's original context length). 
Equal to 1/rope_freq_scale. | #### Gradio @@ -316,6 +352,8 @@ Optionally, you can use the following command-line flags: | `--auto-launch` | Open the web UI in the default browser upon launch. | | `--gradio-auth USER:PWD` | set gradio authentication like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3" | | `--gradio-auth-path GRADIO_AUTH_PATH` | Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3" | +| `--ssl-keyfile SSL_KEYFILE` | The path to the SSL certificate key file. | +| `--ssl-certfile SSL_CERTFILE` | The path to the SSL certificate cert file. | #### API @@ -323,6 +361,7 @@ Optionally, you can use the following command-line flags: |---------------------------------------|-------------| | `--api` | Enable the API extension. | | `--public-api` | Create a public URL for the API using Cloudfare. | +| `--public-api-id PUBLIC_API_ID` | Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. | | `--api-blocking-port BLOCKING_PORT` | The listening port for the blocking API. | | `--api-streaming-port STREAMING_PORT` | The listening port for the streaming API. | @@ -340,12 +379,13 @@ The presets that are included by default are the result of a contest that receiv ## Contributing -* Pull requests, suggestions, and issue reports are welcome. -* Make sure to carefully [search](https://github.com/oobabooga/text-generation-webui/issues) existing issues before starting a new one. -* If you have some experience with git, testing an open pull request and leaving a comment on whether it works as expected or not is immensely helpful. -* A simple way to contribute, even if you are not a programmer, is to leave a 👍 on an issue or pull request that you find relevant. +If you would like to contribute to the project, check out the [Contributing guidelines](https://github.com/oobabooga/text-generation-webui/wiki/Contributing-guidelines). ## Community -* Subreddit: https://www.reddit.com/r/oobaboogazz/ +* Subreddit: https://www.reddit.com/r/oobabooga/ * Discord: https://discord.gg/jwZCF2dPQN + +## Acknowledgment + +In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition, which will allow me to dedicate more time towards realizing the full potential of text-generation-webui. 
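A note on the RoPE flags documented in the README table above: the three options are related by simple formulas, and the sketch below (an illustration, not part of the repository or of this patch) converts between them using the relations stated in the table, rope_freq_base = 10000 * alpha_value ^ (64 / 63) and compress_pos_emb = (context length) / (model's original context length) = 1 / rope_freq_scale. The 2048-token default is an assumption for LLaMA-1-style models; substitute your model's native context length.

```
# Illustrative RoPE flag conversions (assumptions noted in comments).
def rope_freq_base_from_alpha(alpha_value: float) -> float:
    # Relation quoted in the README table: rope_freq_base = 10000 * alpha_value ^ (64 / 63)
    return 10000 * alpha_value ** (64 / 63)


def compress_pos_emb(target_context: int, original_context: int = 2048) -> float:
    # README: compress_pos_emb = (context length) / (model's original context length),
    # which equals 1 / rope_freq_scale. original_context=2048 is an assumption.
    return target_context / original_context


print(round(rope_freq_base_from_alpha(2.0)))  # value to pass as --rope_freq_base for --alpha_value 2
print(compress_pos_emb(4096))                 # 2.0 -> value to pass as --compress_pos_emb
```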
diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py index 14f6f9d6..cccd5b26 100644 --- a/api-examples/api-example-chat-stream.py +++ b/api-examples/api-example-chat-stream.py @@ -20,18 +20,23 @@ async def run(user_input, history): request = { 'user_input': user_input, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, 'history': history, 'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct' 'character': 'Example', 'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset - # 'context_instruct': '', # Optional 'your_name': 'You', - + # 'name1': 'name of user', # Optional + # 'name2': 'name of character', # Optional + # 'context': 'character context', # Optional + # 'greeting': 'greeting', # Optional + # 'name1_instruct': 'You', # Optional + # 'name2_instruct': 'Assistant', # Optional + # 'context_instruct': 'context_instruct', # Optional + # 'turn_template': 'turn_template', # Optional 'regenerate': False, '_continue': False, - 'stop_at_newline': False, - 'chat_generation_attempts': 1, - 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', + 'chat_instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. @@ -56,6 +61,8 @@ async def run(user_input, history): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py index 0e155c63..c197a584 100644 --- a/api-examples/api-example-chat.py +++ b/api-examples/api-example-chat.py @@ -14,18 +14,23 @@ def run(user_input, history): request = { 'user_input': user_input, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, 'history': history, 'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct' 'character': 'Example', 'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset - # 'context_instruct': '', # Optional 'your_name': 'You', - + # 'name1': 'name of user', # Optional + # 'name2': 'name of character', # Optional + # 'context': 'character context', # Optional + # 'greeting': 'greeting', # Optional + # 'name1_instruct': 'You', # Optional + # 'name2_instruct': 'Assistant', # Optional + # 'context_instruct': 'context_instruct', # Optional + # 'turn_template': 'turn_template', # Optional 'regenerate': False, '_continue': False, - 'stop_at_newline': False, - 'chat_generation_attempts': 1, - 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', + 'chat_instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. 
@@ -50,6 +55,8 @@ def run(user_input, history): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/api-examples/api-example-stream.py b/api-examples/api-example-stream.py index 1ae5a91c..bf5eabac 100644 --- a/api-examples/api-example-stream.py +++ b/api-examples/api-example-stream.py @@ -20,6 +20,7 @@ async def run(context): request = { 'prompt': context, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. @@ -44,6 +45,8 @@ async def run(context): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/api-examples/api-example.py b/api-examples/api-example.py index 4e45de9e..16029807 100644 --- a/api-examples/api-example.py +++ b/api-examples/api-example.py @@ -12,6 +12,7 @@ def run(prompt): request = { 'prompt': prompt, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. @@ -36,6 +37,8 @@ def run(prompt): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/characters/instruction-following/WizardLM.yaml b/characters/instruction-following/WizardLM.yaml deleted file mode 100644 index c65bb8f4..00000000 --- a/characters/instruction-following/WizardLM.yaml +++ /dev/null @@ -1,4 +0,0 @@ -user: "" -bot: "### Response:" -turn_template: "<|user-message|>\n\n<|bot|><|bot-message|>\n\n" -context: "" \ No newline at end of file diff --git a/css/chat.css b/css/chat.css deleted file mode 100644 index 17b8d142..00000000 --- a/css/chat.css +++ /dev/null @@ -1,126 +0,0 @@ -.h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { - height: 66.67vh -} - -.gradio-container { - margin-left: auto !important; - margin-right: auto !important; -} - -.w-screen { - width: unset -} - -div.svelte-362y77>*, div.svelte-362y77>.form>* { - flex-wrap: nowrap -} - -/* fixes the API documentation in chat mode */ -.api-docs.svelte-1iguv9h.svelte-1iguv9h.svelte-1iguv9h { - display: grid; -} - -.pending.svelte-1ed2p3z { - opacity: 1; -} - -#extensions { - padding: 0; - padding: 0; -} - -#gradio-chatbot { - height: 66.67vh; -} - -.wrap.svelte-6roggh.svelte-6roggh { - max-height: 92.5%; -} - -/* This is for the microphone button in the whisper extension */ -.sm.svelte-1ipelgc { - width: 100%; -} - -#main button { - min-width: 0 !important; -} - -/*****************************************************/ -/*************** Chat box declarations ***************/ -/*****************************************************/ - -.chat { - margin-left: auto; - margin-right: auto; - max-width: 800px; - height: calc(100vh - 286px); - overflow-y: auto; - padding-right: 20px; - display: flex; - flex-direction: column-reverse; - word-break: break-word; - overflow-wrap: anywhere; - padding-top: 1px; -} - -.message-body li { - margin-top: 0.5em !important; - margin-bottom: 0.5em !important; -} - -.message-body li > p { - display: inline !important; -} - -.message-body ul, .message-body ol { - font-size: 15px !important; -} - -.message-body ul { - list-style-type: disc !important; -} - -.message-body pre { - margin-bottom: 1.25em 
!important; -} - -.message-body code { - white-space: pre-wrap !important; - word-wrap: break-word !important; -} - -.message-body :not(pre) > code { - white-space: normal !important; -} - -@media print { - body { - visibility: hidden; - } - - .chat { - visibility: visible; - position: absolute; - left: 0; - top: 0; - max-width: none; - max-height: none; - width: 100%; - height: fit-content; - display: flex; - flex-direction: column-reverse; - } - - .message { - break-inside: avoid; - } - - .gradio-container { - overflow: visible; - } - - .tab-nav { - display: none !important; - } -} diff --git a/css/chat.js b/css/chat.js deleted file mode 100644 index e304f125..00000000 --- a/css/chat.js +++ /dev/null @@ -1,4 +0,0 @@ -document.getElementById("main").childNodes[0].style = "max-width: 800px; margin-left: auto; margin-right: auto"; -document.getElementById("extensions").style.setProperty("max-width", "800px"); -document.getElementById("extensions").style.setProperty("margin-left", "auto"); -document.getElementById("extensions").style.setProperty("margin-right", "auto"); diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 7682011d..d92e982d 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -10,17 +10,10 @@ line-height: 1.428571429; } -.circle-you { - background-color: gray; - border-radius: 1rem; - /*Change color to any you like to be the border of your image*/ - border: 2px solid white; -} - +.circle-you, .circle-bot { background-color: gray; border-radius: 1rem; - /*Change color to any you like to be the border of the bot's image*/ border: 2px solid white; } @@ -105,3 +98,39 @@ .message-body p em { color: rgb(110, 110, 110) !important; } + +@media screen and (max-width: 688px) { + .message { + display: grid; + grid-template-columns: 60px minmax(0, 1fr); + padding-bottom: 25px; + font-size: 15px; + font-family: Helvetica, Arial, sans-serif; + line-height: 1.428571429; + } + + .circle-you, .circle-bot { + width: 50px; + height: 73px; + border-radius: 0.5rem; + } + + .circle-bot img, + .circle-you img { + width: 100%; + height: 100%; + object-fit: cover; + } + + .text { + padding-left: 0px; + } + + .message-body p { + font-size: 16px !important; + } + + .username { + font-size: 20px; + } +} diff --git a/css/html_4chan_style.css b/css/html_4chan_style.css index 99ac6845..cef9f6eb 100644 --- a/css/html_4chan_style.css +++ b/css/html_4chan_style.css @@ -98,7 +98,7 @@ margin-right: 40px !important; } -#parent #container .message { +#parent #container .message_4chan { color: black; border: none; } \ No newline at end of file diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 575281b1..160aa01c 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -43,6 +43,10 @@ margin-bottom: 9px !important; } +.gradio-container .chat .assistant-message:last-child, .gradio-container .chat .user-message:last-child { + margin-bottom: 0px !important; +} + .dark .chat .assistant-message { background-color: #3741519e; border: 1px solid #4b5563; @@ -58,5 +62,5 @@ code { } .dark code { - background-color: #1a212f !important; + background-color: #0e1321 !important; } \ No newline at end of file diff --git a/css/html_readable_style.css b/css/html_readable_style.css index cd5fca97..2cfa6f2b 100644 --- a/css/html_readable_style.css +++ b/css/html_readable_style.css @@ -26,4 +26,8 @@ .container :not(pre) > code { white-space: normal !important; +} + +.container .hoverable { + font-size: 14px; } \ 
No newline at end of file diff --git a/css/main.css b/css/main.css index 5c17a179..405b57e0 100644 --- a/css/main.css +++ b/css/main.css @@ -7,6 +7,7 @@ } .small-button { + min-width: 0 !important; max-width: 171px; height: 39.594px; align-self: end; @@ -26,6 +27,10 @@ max-width: 2.2em; } +.button_nowrap { + white-space: nowrap; +} + #slim-column { flex: none !important; min-width: 0 !important; @@ -41,9 +46,6 @@ min-height: 0 } -#accordion { -} - .dark svg { fill: white; } @@ -56,7 +58,7 @@ ol li p, ul li p { display: inline-block; } -#main, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab { +#chat-tab, #default-tab, #notebook-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab { border: 0; } @@ -70,7 +72,6 @@ ol li p, ul li p { } #extensions { - padding: 15px; margin-bottom: 35px; } @@ -90,6 +91,8 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { .header_bar { background-color: #f7f7f7; margin-bottom: 20px; + display: inline !important; + overflow-x: scroll; } .dark .header_bar { @@ -97,19 +100,36 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { background-color: #8080802b; } +.header_bar button.selected { + border-radius: 0; +} + .textbox_default textarea { - height: calc(100vh - 380px); + height: calc(100dvh - 280px); } .textbox_default_output textarea { - height: calc(100vh - 190px); + height: calc(100dvh - 190px); } .textbox textarea { - height: calc(100vh - 241px); + height: calc(100dvh - 241px); } -.textbox_default textarea, .textbox_default_output textarea, .textbox textarea { +.textbox_logits textarea { + height: calc(100dvh - 241px); +} + +.textbox_logits_notebook textarea { + height: calc(100dvh - 292px); +} + +.textbox_default textarea, +.textbox_default_output textarea, +.textbox_logits textarea, +.textbox_logits_notebook textarea, +.textbox textarea +{ font-size: 16px !important; color: #46464A !important; } @@ -118,6 +138,16 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { color: #efefef !important; } +@media screen and (max-width: 711px) { + .textbox_default textarea { + height: calc(100dvh - 271px); + } + + div .default-token-counter { + top: calc( 0.5 * (100dvh - 245px) ) !important; + } +} + /* Hide the gradio footer*/ footer { display: none !important; @@ -157,7 +187,7 @@ button { } .pretty_scrollbar::-webkit-scrollbar { - width: 10px; + width: 5px; } .pretty_scrollbar::-webkit-scrollbar-track { @@ -167,13 +197,11 @@ button { .pretty_scrollbar::-webkit-scrollbar-thumb, .pretty_scrollbar::-webkit-scrollbar-thumb:hover { background: #c5c5d2; - border-radius: 10px; } .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { background: #374151; - border-radius: 10px; } .pretty_scrollbar::-webkit-resizer { @@ -183,3 +211,207 @@ button { .dark .pretty_scrollbar::-webkit-resizer { background: #374151; } + +audio { + max-width: 100%; +} + +/* Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui */ +.token-counter { + position: absolute !important; + top: calc( 0.5 * (100dvh - 215px) ) !important; + right: 2px; + z-index: 100; + background: var(--input-background-fill) !important; + min-height: 0 !important; +} + +.default-token-counter { + top: calc( 0.5 * (100dvh - 255px) ) !important; +} + +.token-counter span { + padding: 1px; + box-shadow: 0 0 0 0.3em rgba(192,192,192,0.15), inset 0 0 0.6em rgba(192,192,192,0.075); + border: 2px solid rgba(192,192,192,0.4) !important; + border-radius: 0.4em; +} + +.no-background { + 
background: var(--background-fill-primary) !important; + padding: 0px !important; +} + +/*****************************************************/ +/*************** Chat UI declarations ****************/ +/*****************************************************/ + +.h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { + height: 66.67vh +} + +.gradio-container { + margin-left: auto !important; + margin-right: auto !important; +} + +.w-screen { + width: unset +} + +div.svelte-362y77>*, div.svelte-362y77>.form>* { + flex-wrap: nowrap +} + +.pending.svelte-1ed2p3z { + opacity: 1; +} + +.wrap.svelte-6roggh.svelte-6roggh { + max-height: 92.5%; +} + +/* This is for the microphone button in the whisper extension */ +.sm.svelte-1ipelgc { + width: 100%; +} + +#chat-tab button, #notebook-tab button, #default-tab button { + min-width: 0 !important; +} + +#chat-tab > :first-child, #extensions { + max-width: 800px; + margin-left: auto; + margin-right: auto; +} + +@media screen and (max-width: 688px) { + #chat-tab { + padding: 0px; + } + + #chat { + height: calc(100dvh - 262px) !important; + } + + .bigchat #chat { + height: calc(100dvh - 180px) !important; + } + + .chat { + flex-direction: column-reverse !important; + } +} + +.chat { + margin-left: auto; + margin-right: auto; + max-width: 800px; + height: 100%; + overflow-y: auto; + padding-right: 15px; + display: flex; + flex-direction: column; + word-break: break-word; + overflow-wrap: anywhere; + padding-top: 6px; +} + +#chat { + height: calc(100dvh - 272px); +} + +.bigchat #chat { + height: calc(100dvh - 200px); +} + +#show-controls { + position: absolute; + background-color: transparent; + left: calc(100% - 130px); +} + +.chat > .messages { + display: flex; + flex-direction: column; +} + +.chat .message:last-child { + margin-bottom: 0px !important; + padding-bottom: 0px !important; +} + +.message-body li { + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} + +.message-body ul, .message-body ol { + font-size: 15px !important; +} + +.message-body ul { + list-style-type: disc !important; +} + +.message-body pre { + margin-bottom: 1.25em !important; +} + +.message-body code { + white-space: pre-wrap !important; + word-wrap: break-word !important; +} + +.message-body :not(pre) > code { + white-space: normal !important; +} + +#chat-input { + padding: 0; + padding-top: 18px; + background: var(--background-fill-primary); + border: none; +} + +#chat-input textarea:focus { + box-shadow: none !important; +} + +@media print { + body { + visibility: hidden; + } + + .chat { + visibility: visible; + position: absolute; + left: 0; + top: 0; + max-width: unset; + max-height: unset; + width: 100%; + overflow-y: visible; + } + + .message { + break-inside: avoid; + } + + .gradio-container { + overflow: visible; + } + + .tab-nav { + display: none !important; + } + + #chat-tab > :first-child { + max-width: unset; + } +} diff --git a/css/main.js b/css/main.js deleted file mode 100644 index f3b3c05f..00000000 --- a/css/main.js +++ /dev/null @@ -1,25 +0,0 @@ -document.getElementById("main").parentNode.childNodes[0].classList.add("header_bar"); -document.getElementById("main").parentNode.style = "padding: 0; margin: 0"; -document.getElementById("main").parentNode.parentNode.parentNode.style = "padding: 0"; - -// Get references to the elements -let main = document.getElementById('main'); -let main_parent = main.parentNode; -let extensions = document.getElementById('extensions'); - -// Add an 
event listener to the main element -main_parent.addEventListener('click', function(e) { - // Check if the main element is visible - if (main.offsetHeight > 0 && main.offsetWidth > 0) { - extensions.style.display = 'flex'; - } else { - extensions.style.display = 'none'; - } -}); - -const textareaElements = document.querySelectorAll('.add_scrollbar textarea'); -for(i = 0; i < textareaElements.length; i++) { - textareaElements[i].classList.remove('scroll-hide'); - textareaElements[i].classList.add('pretty_scrollbar'); - textareaElements[i].style.resize = "none"; -} diff --git a/docker/Dockerfile b/docker/Dockerfile index 7cc0ff15..ded0b6c2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,7 +16,7 @@ RUN . /build/venv/bin/activate && \ # https://developer.nvidia.com/cuda-gpus # for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5" -ARG TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" +ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}" RUN . /build/venv/bin/activate && \ python3 setup_cuda.py bdist_wheel -d . @@ -26,7 +26,7 @@ LABEL maintainer="Your Name " LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI" RUN apt-get update && \ - apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \ + apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ ffmpeg && \ rm -rf /var/lib/apt/lists/* RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv @@ -51,11 +51,15 @@ COPY extensions/elevenlabs_tts/requirements.txt /app/extensions/elevenlabs_tts/r COPY extensions/google_translate/requirements.txt /app/extensions/google_translate/requirements.txt COPY extensions/silero_tts/requirements.txt /app/extensions/silero_tts/requirements.txt COPY extensions/whisper_stt/requirements.txt /app/extensions/whisper_stt/requirements.txt +COPY extensions/superbooga/requirements.txt /app/extensions/superbooga/requirements.txt +COPY extensions/openai/requirements.txt /app/extensions/openai/requirements.txt RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/api && pip3 install -r requirements.txt RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/elevenlabs_tts && pip3 install -r requirements.txt RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/google_translate && pip3 install -r requirements.txt RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/silero_tts && pip3 install -r requirements.txt RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/whisper_stt && pip3 install -r requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/superbooga && pip3 install -r requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/openai && pip3 install -r requirements.txt COPY requirements.txt /app/requirements.txt RUN . 
/app/venv/bin/activate && \ diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 46b27580..ce29f33b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -23,6 +23,7 @@ services: - ./prompts:/app/prompts - ./softprompts:/app/softprompts - ./training:/app/training + - ./cloudflared:/etc/cloudflared deploy: resources: reservations: diff --git a/docs/Extensions.md b/docs/Extensions.md index 4e59e855..53acce59 100644 --- a/docs/Extensions.md +++ b/docs/Extensions.md @@ -39,8 +39,8 @@ The extensions framework is based on special functions and variables that you ca | `def ui()` | Creates custom gradio elements when the UI is launched. | | `def custom_css()` | Returns custom CSS as a string. It is applied whenever the web UI is loaded. | | `def custom_js()` | Same as above but for javascript. | -| `def input_modifier(string, state)` | Modifies the input string before it enters the model. In chat mode, it is applied to the user message. Otherwise, it is applied to the entire prompt. | -| `def output_modifier(string, state)` | Modifies the output string before it is presented in the UI. In chat mode, it is applied to the bot's reply. Otherwise, it is applied to the entire output. | +| `def input_modifier(string, state, is_chat=False)` | Modifies the input string before it enters the model. In chat mode, it is applied to the user message. Otherwise, it is applied to the entire prompt. | +| `def output_modifier(string, state, is_chat=False)` | Modifies the output string before it is presented in the UI. In chat mode, it is applied to the bot's reply. Otherwise, it is applied to the entire output. | | `def chat_input_modifier(text, visible_text, state)` | Modifies both the visible and internal inputs in chat mode. Can be used to hijack the chat input with custom content. | | `def bot_prefix_modifier(string, state)` | Applied in chat mode to the prefix for the bot's reply. | | `def state_modifier(state)` | Modifies the dictionary containing the UI input parameters before it is used by the text generation functions. | @@ -163,7 +163,7 @@ def chat_input_modifier(text, visible_text, state): """ return text, visible_text -def input_modifier(string, state): +def input_modifier(string, state, is_chat=False): """ In default/notebook modes, modifies the whole prompt. @@ -196,7 +196,7 @@ def logits_processor_modifier(processor_list, input_ids): processor_list.append(MyLogits()) return processor_list -def output_modifier(string, state): +def output_modifier(string, state, is_chat=False): """ Modifies the LLM output before it gets presented. diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index 838595ef..428d7560 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -64,59 +64,19 @@ python server.py --autogptq --gpu-memory 3000MiB 6000MiB --model model_name ### Using LoRAs with AutoGPTQ -Not supported yet. +Works fine for a single LoRA. ## GPTQ-for-LLaMa GPTQ-for-LLaMa is the original adaptation of GPTQ for the LLaMA model. It was made possible by [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-for-LLaMa): https://github.com/qwopqwop200/GPTQ-for-LLaMa -Different branches of GPTQ-for-LLaMa are currently available, including: - -| Branch | Comment | -|----|----| -| [Old CUDA branch (recommended)](https://github.com/oobabooga/GPTQ-for-LLaMa/) | The fastest branch, works on Windows and Linux. 
| -| [Up-to-date triton branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa) | Slightly more precise than the old CUDA branch from 13b upwards, significantly more precise for 7b. 2x slower for small context size and only works on Linux. | -| [Up-to-date CUDA branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda) | As precise as the up-to-date triton branch, 10x slower than the old cuda branch for small context size. | - -Overall, I recommend using the old CUDA branch. It is included by default in the one-click-installer for this web UI. - -### Installation - -Start by cloning GPTQ-for-LLaMa into your `text-generation-webui/repositories` folder: - -``` -mkdir repositories -cd repositories -git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda -``` - -If you want to you to use the up-to-date CUDA or triton branches instead of the old CUDA branch, use these commands: - -``` -git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b cuda -``` - -``` -git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b triton -``` - -Next you need to install the CUDA extensions. You can do that either by installing the precompiled wheels, or by compiling the wheels yourself. +A Python package containing both major CUDA versions of GPTQ-for-LLaMa is used to simplify installation and compatibility: https://github.com/jllllll/GPTQ-for-LLaMa-CUDA ### Precompiled wheels -Kindly provided by our friend jllllll: https://github.com/jllllll/GPTQ-for-LLaMa-Wheels +Kindly provided by our friend jllllll: https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases -Windows: - -``` -pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/main/quant_cuda-0.0.0-cp310-cp310-win_amd64.whl -``` - -Linux: - -``` -pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/Linux-x64/quant_cuda-0.0.0-cp310-cp310-linux_x86_64.whl -``` +Wheels are included in requirements.txt and are installed with the webui on supported systems. ### Manual installation @@ -124,20 +84,19 @@ pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/Linux-x64/quant ``` conda activate textgen -conda install -c conda-forge cudatoolkit-dev +conda install cuda -c nvidia/label/cuda-11.7.1 ``` The command above takes some 10 minutes to run and shows no progress bar or updates along the way. -You are also going to need to have a C++ compiler installed. On Linux, `sudo apt install build-essential` or equivalent is enough. +You are also going to need to have a C++ compiler installed. On Linux, `sudo apt install build-essential` or equivalent is enough. On Windows, Visual Studio or Visual Studio Build Tools is required. -If you're using an older version of CUDA toolkit (e.g. 11.7) but the latest version of `gcc` and `g++` (12.0+), you should downgrade with: `conda install -c conda-forge gxx==11.3.0`. Kernel compilation will fail otherwise. +If you're using an older version of CUDA toolkit (e.g. 11.7) but the latest version of `gcc` and `g++` (12.0+) on Linux, you should downgrade with: `conda install -c conda-forge gxx==11.3.0`. Kernel compilation will fail otherwise. 
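Before running Step 2 below, it can save time to confirm that the CUDA toolkit and C++ compiler from Step 1 are actually visible inside the activated `textgen` environment. The sketch below is a hypothetical pre-flight check, not part of the webui; it assumes a Linux setup and only shells out to `nvcc --version` and `g++ --version`, which should exist once the conda `cuda` package and `build-essential` are installed (Windows users building with Visual Studio would check their MSVC toolchain instead).

```
# Hypothetical pre-flight check before compiling the CUDA extension (Step 2).
import shutil
import subprocess

for tool in ("nvcc", "g++"):
    path = shutil.which(tool)
    if path is None:
        print(f"{tool}: not found on PATH - re-check Step 1 inside the textgen env")
        continue
    # The first line of `<tool> --version` is enough to spot a CUDA/gcc mismatch
    # like the CUDA 11.7 toolkit vs. gcc 12+ case described above.
    result = subprocess.run([tool, "--version"], capture_output=True, text=True)
    first_line = result.stdout.splitlines()[0] if result.stdout else result.stderr.strip()
    print(f"{tool}: {first_line}")
```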
#### Step 2: compile the CUDA extensions ``` -cd repositories/GPTQ-for-LLaMa -python setup_cuda.py install +python -m pip install git+https://github.com/jllllll/GPTQ-for-LLaMa-CUDA -v ``` ### Getting pre-converted LLaMA weights diff --git a/download-model.py b/download-model.py index 0f650516..b36865d7 100644 --- a/download-model.py +++ b/download-model.py @@ -24,14 +24,14 @@ from tqdm.contrib.concurrent import thread_map class ModelDownloader: def __init__(self, max_retries=5): - self.s = requests.Session() + self.session = requests.Session() if max_retries: - self.s.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=max_retries)) - self.s.mount('https://huggingface.co', HTTPAdapter(max_retries=max_retries)) + self.session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=max_retries)) + self.session.mount('https://huggingface.co', HTTPAdapter(max_retries=max_retries)) if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None: - self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS')) + self.session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS')) if os.getenv('HF_TOKEN') is not None: - self.s.headers = {'authorization': f'Bearer {os.getenv("HF_TOKEN")}'} + self.session.headers = {'authorization': f'Bearer {os.getenv("HF_TOKEN")}'} def sanitize_model_and_branch_names(self, model, branch): if model[-1] == '/': @@ -57,12 +57,13 @@ class ModelDownloader: classifications = [] has_pytorch = False has_pt = False - # has_ggml = False + has_gguf = False + has_ggml = False has_safetensors = False is_lora = False while True: url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "") - r = self.s.get(url, timeout=10) + r = self.session.get(url, timeout=10) r.raise_for_status() content = r.content @@ -75,13 +76,14 @@ class ModelDownloader: if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')): is_lora = True - is_pytorch = re.match("(pytorch|adapter|gptq)_model.*\.bin", fname) - is_safetensors = re.match(".*\.safetensors", fname) - is_pt = re.match(".*\.pt", fname) - is_ggml = re.match(".*ggml.*\.bin", fname) - is_tokenizer = re.match("(tokenizer|ice|spiece).*\.model", fname) - is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer - if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)): + is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname) + is_safetensors = re.match(r".*\.safetensors", fname) + is_pt = re.match(r".*\.pt", fname) + is_gguf = re.match(r'.*\.gguf', fname) + is_ggml = re.match(r".*ggml.*\.bin", fname) + is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) + is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer + if any((is_pytorch, is_safetensors, is_pt, is_gguf, is_ggml, is_tokenizer, is_text)): if 'lfs' in dict[i]: sha256.append([fname, dict[i]['lfs']['oid']]) @@ -101,8 +103,11 @@ class ModelDownloader: elif is_pt: has_pt = True classifications.append('pt') + elif is_gguf: + has_gguf = True + classifications.append('gguf') elif is_ggml: - # has_ggml = True + has_ggml = True classifications.append('ggml') cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50' @@ -115,6 +120,12 @@ class ModelDownloader: if classifications[i] in ['pytorch', 'pt']: links.pop(i) + # If both GGML and GGUF are available, download GGUF only + if has_ggml and has_gguf: + for i in range(len(classifications) - 1, -1, -1): + if classifications[i] == 'ggml': + links.pop(i) + return links, sha256, is_lora def 
get_output_folder(self, model, branch, is_lora, base_folder=None): @@ -136,7 +147,7 @@ class ModelDownloader: if output_path.exists() and not start_from_scratch: # Check if the file has already been downloaded completely - r = self.s.get(url, stream=True, timeout=10) + r = self.session.get(url, stream=True, timeout=10) total_size = int(r.headers.get('content-length', 0)) if output_path.stat().st_size >= total_size: return @@ -145,7 +156,7 @@ class ModelDownloader: headers = {'Range': f'bytes={output_path.stat().st_size}-'} mode = 'ab' - with self.s.get(url, stream=True, headers=headers, timeout=10) as r: + with self.session.get(url, stream=True, headers=headers, timeout=10) as r: r.raise_for_status() # Do not continue the download if the request was unsuccessful total_size = int(r.headers.get('content-length', 0)) block_size = 1024 * 1024 # 1MB diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index fbbc5ec1..6b28205a 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -200,7 +200,7 @@ class Handler(BaseHTTPRequestHandler): super().end_headers() -def _run_server(port: int, share: bool = False): +def _run_server(port: int, share: bool = False, tunnel_id=str): address = '0.0.0.0' if shared.args.listen else '127.0.0.1' server = ThreadingHTTPServer((address, port), Handler) @@ -210,7 +210,7 @@ def _run_server(port: int, share: bool = False): if share: try: - try_start_cloudflared(port, max_attempts=3, on_start=on_start) + try_start_cloudflared(port, tunnel_id, max_attempts=3, on_start=on_start) except Exception: pass else: @@ -220,5 +220,5 @@ def _run_server(port: int, share: bool = False): server.serve_forever() -def start_server(port: int, share: bool = False): - Thread(target=_run_server, args=[port, share], daemon=True).start() +def start_server(port: int, share: bool = False, tunnel_id=str): + Thread(target=_run_server, args=[port, share, tunnel_id], daemon=True).start() diff --git a/extensions/api/requirements.txt b/extensions/api/requirements.txt index 14e29d35..e4f26c3a 100644 --- a/extensions/api/requirements.txt +++ b/extensions/api/requirements.txt @@ -1,2 +1,2 @@ -flask_cloudflared==0.0.12 +flask_cloudflared==0.0.14 websockets==11.0.2 \ No newline at end of file diff --git a/extensions/api/script.py b/extensions/api/script.py index 5d1b1a68..80617b3e 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -4,5 +4,5 @@ from modules import shared def setup(): - blocking_api.start_server(shared.args.api_blocking_port, share=shared.args.public_api) - streaming_api.start_server(shared.args.api_streaming_port, share=shared.args.public_api) + blocking_api.start_server(shared.args.api_blocking_port, share=shared.args.public_api, tunnel_id=shared.args.public_api_id) + streaming_api.start_server(shared.args.api_streaming_port, share=shared.args.public_api, tunnel_id=shared.args.public_api_id) diff --git a/extensions/api/streaming_api.py b/extensions/api/streaming_api.py index 6afa827d..9175eeb0 100644 --- a/extensions/api/streaming_api.py +++ b/extensions/api/streaming_api.py @@ -102,7 +102,7 @@ async def _run(host: str, port: int): await asyncio.Future() # run forever -def _run_server(port: int, share: bool = False): +def _run_server(port: int, share: bool = False, tunnel_id=str): address = '0.0.0.0' if shared.args.listen else '127.0.0.1' def on_start(public_url: str): @@ -111,7 +111,7 @@ def _run_server(port: int, share: bool = False): if share: try: - try_start_cloudflared(port, max_attempts=3, on_start=on_start) + 
try_start_cloudflared(port, tunnel_id, max_attempts=3, on_start=on_start) except Exception as e: print(e) else: @@ -120,5 +120,5 @@ def _run_server(port: int, share: bool = False): asyncio.run(_run(host=address, port=port)) -def start_server(port: int, share: bool = False): - Thread(target=_run_server, args=[port, share], daemon=True).start() +def start_server(port: int, share: bool = False, tunnel_id=str): + Thread(target=_run_server, args=[port, share, tunnel_id], daemon=True).start() diff --git a/extensions/api/util.py b/extensions/api/util.py index 2358b7d2..032a9e5c 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -21,6 +21,7 @@ def build_parameters(body, chat=False): generate_params = { 'max_new_tokens': int(body.get('max_new_tokens', body.get('max_length', 200))), + 'auto_max_new_tokens': bool(body.get('auto_max_new_tokens', False)), 'do_sample': bool(body.get('do_sample', True)), 'temperature': float(body.get('temperature', 0.5)), 'top_p': float(body.get('top_p', 1)), @@ -42,6 +43,8 @@ def build_parameters(body, chat=False): 'mirostat_mode': int(body.get('mirostat_mode', 0)), 'mirostat_tau': float(body.get('mirostat_tau', 5)), 'mirostat_eta': float(body.get('mirostat_eta', 0.1)), + 'guidance_scale': float(body.get('guidance_scale', 1)), + 'negative_prompt': str(body.get('negative_prompt', '')), 'seed': int(body.get('seed', -1)), 'add_bos_token': bool(body.get('add_bos_token', True)), 'truncation_length': int(body.get('truncation_length', body.get('max_context_length', 2048))), @@ -65,30 +68,28 @@ def build_parameters(body, chat=False): name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False) name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True) generate_params.update({ - 'stop_at_newline': bool(body.get('stop_at_newline', shared.settings['stop_at_newline'])), - 'chat_generation_attempts': int(body.get('chat_generation_attempts', shared.settings['chat_generation_attempts'])), 'mode': str(body.get('mode', 'chat')), - 'name1': name1, - 'name2': name2, - 'context': context, - 'greeting': greeting, - 'name1_instruct': name1_instruct, - 'name2_instruct': name2_instruct, - 'context_instruct': body.get('context_instruct', context_instruct), - 'turn_template': turn_template, - 'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])), + 'name1': str(body.get('name1', name1)), + 'name2': str(body.get('name2', name2)), + 'context': str(body.get('context', context)), + 'greeting': str(body.get('greeting', greeting)), + 'name1_instruct': str(body.get('name1_instruct', name1_instruct)), + 'name2_instruct': str(body.get('name2_instruct', name2_instruct)), + 'context_instruct': str(body.get('context_instruct', context_instruct)), + 'turn_template': str(body.get('turn_template', turn_template)), + 'chat-instruct_command': str(body.get('chat_instruct_command', body.get('chat-instruct_command', shared.settings['chat-instruct_command']))), 'history': body.get('history', {'internal': [], 'visible': []}) }) return generate_params -def try_start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): +def try_start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): Thread(target=_start_cloudflared, args=[ - port, max_attempts, on_start], 
daemon=True).start() + port, tunnel_id, max_attempts, on_start], daemon=True).start() -def _start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): +def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): try: from flask_cloudflared import _run_cloudflared except ImportError: @@ -98,7 +99,10 @@ def _start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Call for _ in range(max_attempts): try: - public_url = _run_cloudflared(port, port + 1) + if tunnel_id is not None: + public_url = _run_cloudflared(port, port + 1, tunnel_id=tunnel_id) + else: + public_url = _run_cloudflared(port, port + 1) if on_start: on_start(public_url) diff --git a/extensions/elevenlabs_tts/script.py b/extensions/elevenlabs_tts/script.py index f74e1047..68ae16bd 100644 --- a/extensions/elevenlabs_tts/script.py +++ b/extensions/elevenlabs_tts/script.py @@ -1,12 +1,13 @@ +import html import re from pathlib import Path import elevenlabs import gradio as gr -from modules import chat, shared -from modules.utils import gradio +from modules import chat, shared, ui_chat from modules.logging_colors import logger +from modules.utils import gradio params = { 'activate': True, @@ -111,7 +112,7 @@ def output_modifier(string): output_file = Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.mp3'.format(wav_idx)) print(f'Outputting audio to {str(output_file)}') try: - audio = elevenlabs.generate(text=string, voice=params['selected_voice'], model=params['model']) + audio = elevenlabs.generate(text=html.unescape(string), voice=params['selected_voice'], model=params['model']) elevenlabs.save(audio, str(output_file)) autoplay = 'autoplay' if params['autoplay'] else '' @@ -167,24 +168,23 @@ def ui(): convert_cancel = gr.Button('Cancel', visible=False) convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False) - if shared.is_chat(): - # Convert history with confirmation - convert_arr = [convert_confirm, convert, convert_cancel] - convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) - convert_confirm.click( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( - remove_tts_from_history, gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + # Convert history with confirmation + convert_arr = [convert_confirm, convert, convert_cancel] + convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) + convert_confirm.click( + lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( + remove_tts_from_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) - convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) - # Toggle message text in history - show_text.change( - lambda x: params.update({"show_text": x}), show_text, None).then( - toggle_text_in_history, 
gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + # Toggle message text in history + show_text.change( + lambda x: params.update({"show_text": x}), show_text, None).then( + toggle_text_in_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) # Event functions to update the parameters in the backend activate.change(lambda x: params.update({'activate': x}), activate, None) diff --git a/extensions/example/script.py b/extensions/example/script.py index b4db7102..44f0cb3c 100644 --- a/extensions/example/script.py +++ b/extensions/example/script.py @@ -59,7 +59,7 @@ def chat_input_modifier(text, visible_text, state): """ return text, visible_text -def input_modifier(string, state): +def input_modifier(string, state, is_chat=False): """ In default/notebook modes, modifies the whole prompt. @@ -92,7 +92,7 @@ def logits_processor_modifier(processor_list, input_ids): processor_list.append(MyLogits()) return processor_list -def output_modifier(string, state): +def output_modifier(string, state, is_chat=False): """ Modifies the LLM output before it gets presented. diff --git a/extensions/gallery/script.js b/extensions/gallery/script.js new file mode 100644 index 00000000..4ff23afc --- /dev/null +++ b/extensions/gallery/script.js @@ -0,0 +1,33 @@ +let gallery_element = document.getElementById('gallery-extension'); +let chat_mode_element = document.getElementById('chat-mode'); + +let extensions_block = document.getElementById('extensions'); +let extensions_block_size = extensions_block.childNodes.length; +let gallery_only = (extensions_block_size == 5); + +document.querySelector('.header_bar').addEventListener('click', function(event) { + if (event.target.tagName === 'BUTTON') { + const buttonText = event.target.textContent.trim(); + + let chat_visible = (buttonText == 'Chat'); + let default_visible = (buttonText == 'Default'); + let notebook_visible = (buttonText == 'Notebook'); + let chat_mode_visible = (chat_mode_element.offsetHeight > 0 && chat_mode_element.offsetWidth > 0); + + // Only show this extension in the Chat tab + if (chat_visible) { + if (chat_mode_visible) { + gallery_element.style.display = 'block'; + extensions_block.style.display = ''; + } else { + gallery_element.style.display = 'none'; + extensions_block.style.display = 'none'; + } + } else { + gallery_element.style.display = 'none'; + if (gallery_only) { + extensions_block.style.display = 'none'; + } + } + } +}); diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index 993ef273..611a11f4 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -82,8 +82,13 @@ def select_character(evt: gr.SelectData): return (evt.value[1]) +def custom_js(): + path_to_js = Path(__file__).parent.resolve() / 'script.js' + return open(path_to_js, 'r').read() + + def ui(): - with gr.Accordion("Character gallery", open=False): + with gr.Accordion("Character gallery", open=False, elem_id='gallery-extension'): update = gr.Button("Refresh") gr.HTML(value="") gallery = gr.Dataset(components=[gr.HTML(visible=False)], diff --git a/extensions/long_replies/script.py b/extensions/long_replies/script.py index a30b05a7..035e8c9e 100644 --- a/extensions/long_replies/script.py +++ b/extensions/long_replies/script.py @@ -28,7 
+28,7 @@ class MyLogits(LogitsProcessor): def __call__(self, input_ids, scores): if input_ids.shape[-1] - initial_size < params["min_length"]: scores[...,self.newline_id] = -1000 - scores[...,shared.tokenizer.eos_token_id] = -1000 + # scores[...,shared.tokenizer.eos_token_id] = -1000 # probs = torch.softmax(scores, dim=-1, dtype=torch.float) # probs[0] /= probs[0].sum() diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 646da958..3e277710 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -165,7 +165,7 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): # Instruct models can be much better if shared.settings['instruction_template']: try: - instruct = yaml.safe_load(open(f"characters/instruction-following/{shared.settings['instruction_template']}.yaml", 'r')) + instruct = yaml.safe_load(open(f"instruction-templates/{shared.settings['instruction_template']}.yaml", 'r')) template = instruct['turn_template'] system_message_template = "{message}" @@ -193,7 +193,7 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): except Exception as e: req_params['stopping_strings'].extend(['\nUser:', 'User:']) # XXX User: prompt here also - print(f"Exception: When loading characters/instruction-following/{shared.settings['instruction_template']}.yaml: {repr(e)}") + print(f"Exception: When loading instruction-templates/{shared.settings['instruction_template']}.yaml: {repr(e)}") print("Warning: Loaded default instruction-following template for model.") else: diff --git a/extensions/openai/defaults.py b/extensions/openai/defaults.py index 52f0d641..ffef12d0 100644 --- a/extensions/openai/defaults.py +++ b/extensions/openai/defaults.py @@ -4,6 +4,7 @@ import copy # Data type is important, Ex. 
use 0.0 for a float 0 default_req_params = { 'max_new_tokens': 16, # 'Inf' for chat + 'auto_max_new_tokens': False, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 1, # choose 20 for chat in absence of another default @@ -32,6 +33,8 @@ default_req_params = { 'mirostat_mode': 0, 'mirostat_tau': 5.0, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'ban_eos_token': False, 'skip_special_tokens': True, 'custom_stopping_strings': '', diff --git a/extensions/openai/edits.py b/extensions/openai/edits.py index f10f5779..2b527dc0 100644 --- a/extensions/openai/edits.py +++ b/extensions/openai/edits.py @@ -31,7 +31,7 @@ def edits(instruction: str, input: str, temperature=1.0, top_p=1.0) -> dict: stopping_strings.extend(['\n###']) else: try: - instruct = yaml.safe_load(open(f"characters/instruction-following/{shared.settings['instruction_template']}.yaml", 'r')) + instruct = yaml.safe_load(open(f"instruction-templates/{shared.settings['instruction_template']}.yaml", 'r')) template = instruct['turn_template'] template = template\ @@ -45,7 +45,7 @@ def edits(instruction: str, input: str, temperature=1.0, top_p=1.0) -> dict: except Exception as e: instruction_template = default_template - print(f"Exception: When loading characters/instruction-following/{shared.settings['instruction_template']}.yaml: {repr(e)}") + print(f"Exception: When loading instruction-templates/{shared.settings['instruction_template']}.yaml: {repr(e)}") print("Warning: Loaded default instruction-following template (Alpaca) for model.") else: stopping_strings.extend(['\n###']) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index f95205a5..d1faa019 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -67,10 +67,13 @@ class Handler(BaseHTTPRequestHandler): self.send_response(code) self.send_access_control_headers() self.send_header('Content-Type', 'application/json') - self.end_headers() response = json.dumps(ret) r_utf8 = response.encode('utf-8') + + self.send_header('Content-Length', str(len(r_utf8))) + self.end_headers() + self.wfile.write(r_utf8) if not no_debug: debug_msg(r_utf8) diff --git a/extensions/perplexity_colors/script.py b/extensions/perplexity_colors/script.py index 84b62a30..2a986ac4 100644 --- a/extensions/perplexity_colors/script.py +++ b/extensions/perplexity_colors/script.py @@ -1,17 +1,22 @@ +import time + import gradio +import numpy as np import torch from transformers import LogitsProcessor -import numpy as np -from modules import shared +from modules import html_generator, shared params = { + 'active': True, 'color_by_perplexity': False, 'color_by_probability': False, - 'ppl_scale': 15.0, # No slider for this right now, because I don't think it really needs to be changed. Very large perplexity scores don't show up often. - #'probability_dropdown': False + 'ppl_scale': 15.0, # No slider for this right now, because I don't think it really needs to be changed. Very large perplexity scores don't show up often. 
+ 'probability_dropdown': False, + 'verbose': False # For debugging mostly } + class PerplexityLogits(LogitsProcessor): def __init__(self, verbose=False): self.generated_token_ids = [] @@ -23,9 +28,10 @@ class PerplexityLogits(LogitsProcessor): self.verbose = verbose def __call__(self, input_ids, scores): + # t0 = time.time() probs = torch.softmax(scores, dim=-1, dtype=torch.float) - log_probs = torch.nan_to_num(torch.log(probs)) - entropy = -torch.sum(probs*log_probs) + log_probs = torch.nan_to_num(torch.log(probs)) # Note: This is to convert log(0) nan to 0, but probs*log_probs makes this 0 not affect the perplexity. + entropy = -torch.sum(probs * log_probs) entropy = entropy.cpu().numpy() perplexity = round(float(np.exp(entropy)), 4) self.perplexities_list.append(perplexity) @@ -36,25 +42,25 @@ class PerplexityLogits(LogitsProcessor): if len(self.selected_probs) > 0: # Is the selected token in the top tokens? if self.verbose: - print(shared.tokenizer.decode(last_token_id)) - print([shared.tokenizer.decode(token_id) for token_id in self.top_token_ids_list[-1]]) - print(self.top_probs_list[-1]) - if last_token_id in self.top_token_ids_list[-1]: - idx = self.top_token_ids_list[-1].index(last_token_id) - self.selected_probs.append(self.top_probs_list[-1][idx]) + print('Probs: Token after', shared.tokenizer.decode(last_token_id)) + print('Probs:', [shared.tokenizer.decode(token_id) for token_id in self.top_token_ids_list[-1][0]]) + print('Probs:', [round(float(prob), 4) for prob in self.top_probs_list[-1][0]]) + if last_token_id in self.top_token_ids_list[-1][0]: + idx = self.top_token_ids_list[-1][0].index(last_token_id) + self.selected_probs.append(self.top_probs_list[-1][0][idx]) else: - self.top_token_ids_list[-1].append(last_token_id) + self.top_token_ids_list[-1][0].append(last_token_id) last_prob = round(float(self.last_probs[last_token_id]), 4) - self.top_probs_list[-1].append(last_prob) + self.top_probs_list[-1][0].append(last_prob) self.selected_probs.append(last_prob) else: - self.selected_probs.append(1.0) # Placeholder for the last token of the prompt + self.selected_probs.append(1.0) # Placeholder for the last token of the prompt if self.verbose: pplbar = "-" if not np.isnan(perplexity): - pplbar = "*"*round(perplexity) - print(f"{last_token}\t{perplexity:.2f}\t{pplbar}") + pplbar = "*" * round(perplexity) + print(f"PPL: Token after {shared.tokenizer.decode(last_token_id)}\t{perplexity:.2f}\t{pplbar}") # Get top 5 probabilities top_tokens_and_probs = torch.topk(probs, 5) @@ -63,153 +69,241 @@ class PerplexityLogits(LogitsProcessor): self.top_token_ids_list.append(top_token_ids) self.top_probs_list.append(top_probs) - - probs = probs.cpu().numpy().flatten() - self.last_probs = probs # Need to keep this as a reference for top probs + probs = probs.cpu().numpy().flatten() + self.last_probs = probs # Need to keep this as a reference for top probs + + # t1 = time.time() + # print(f"PPL Processor: {(t1-t0):.3f} s") + # About 1 ms, though occasionally up to around 100 ms, not sure why... # Doesn't actually modify the logits! 
return scores + # Stores the perplexity and top probabilities ppl_logits_processor = None + def logits_processor_modifier(logits_processor_list, input_ids): global ppl_logits_processor - ppl_logits_processor = PerplexityLogits() - logits_processor_list.append(ppl_logits_processor) + if params['active']: + ppl_logits_processor = PerplexityLogits(verbose=params['verbose']) + logits_processor_list.append(ppl_logits_processor) + def output_modifier(text): global ppl_logits_processor + # t0 = time.time() + + if not params['active']: + return text # TODO: It's probably more efficient to do this above rather than modifying all these lists # Remove last element of perplexities_list, top_token_ids_list, top_tokens_list, top_probs_list since everything is off by one because this extension runs before generation perplexities = ppl_logits_processor.perplexities_list[:-1] top_token_ids_list = ppl_logits_processor.top_token_ids_list[:-1] - top_tokens_list = [[shared.tokenizer.decode(token_id) for token_id in top_token_ids] for top_token_ids in top_token_ids_list] + top_tokens_list = [[shared.tokenizer.decode(token_id) for token_id in top_token_ids[0]] for top_token_ids in top_token_ids_list] top_probs_list = ppl_logits_processor.top_probs_list[:-1] # Remove first element of generated_token_ids, generated_tokens, selected_probs because they are for the last token of the prompt gen_token_ids = ppl_logits_processor.generated_token_ids[1:] gen_tokens = [shared.tokenizer.decode(token_id) for token_id in gen_token_ids] sel_probs = ppl_logits_processor.selected_probs[1:] - end_part = '' # Helps with finding the index after replacing part of the text. - in_code = False # Since the tags mess up code blocks, avoid coloring while inside a code block, based on finding tokens with '`' in them + end_part = '' if params['probability_dropdown'] else '' # Helps with finding the index after replacing part of the text. 
- if params['color_by_probability'] and params['color_by_perplexity']: - i = 0 - for token, prob, ppl, top_tokens, top_probs in zip(gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list): - if '`' in token: - in_code = not in_code - continue - if in_code: - continue + i = 0 + for token, prob, ppl, top_tokens, top_probs in zip(gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list): + color = 'ffffff' + if params['color_by_probability'] and params['color_by_perplexity']: color = probability_perplexity_color_scale(prob, ppl) - if token in text[i:]: - text = text[:i] + text[i:].replace(token, add_color_html(token, color), 1) - i += text[i:].find(end_part) + len(end_part) - elif params['color_by_perplexity']: - i = 0 - for token, ppl, top_tokens, top_probs in zip(gen_tokens, perplexities, top_tokens_list, top_probs_list): - if '`' in token: - in_code = not in_code - continue - if in_code: - continue + elif params['color_by_perplexity']: color = perplexity_color_scale(ppl) - if token in text[i:]: - text = text[:i] + text[i:].replace(token, add_color_html(token, color), 1) - i += text[i:].find(end_part) + len(end_part) - elif params['color_by_probability']: - i = 0 - for token, prob, top_tokens, top_probs in zip(gen_tokens, sel_probs, top_tokens_list, top_probs_list): - if '`' in token: - in_code = not in_code - continue - if in_code: - continue + elif params['color_by_probability']: color = probability_color_scale(prob) - if token in text[i:]: + if token in text[i:]: + if params['probability_dropdown']: + text = text[:i] + text[i:].replace(token, add_dropdown_html(token, color, top_tokens, top_probs[0], ppl), 1) + else: text = text[:i] + text[i:].replace(token, add_color_html(token, color), 1) - i += text[i:].find(end_part) + len(end_part) + i += text[i:].find(end_part) + len(end_part) - print('Average perplexity:', round(np.mean(perplexities), 4)) + # Use full perplexity list for calculating the average here. 
+ print('Average perplexity:', round(np.mean(ppl_logits_processor.perplexities_list[:-1]), 4)) + # t1 = time.time() + # print(f"Modifier: {(t1-t0):.3f} s") + # About 50 ms return text -# Green-yellow-red color scale + def probability_color_scale(prob): + ''' + Green-yellow-red color scale + ''' + rv = 0 gv = 0 if prob <= 0.5: rv = 'ff' - gv = hex(int(255*prob*2))[2:] + gv = hex(int(255 * prob * 2))[2:] if len(gv) < 2: - gv = '0'*(2 - len(gv)) + gv + gv = '0' * (2 - len(gv)) + gv else: - rv = hex(int(255 - 255*(prob - 0.5)*2))[2:] + rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:] gv = 'ff' if len(rv) < 2: - rv = '0'*(2 - len(rv)) + rv + rv = '0' * (2 - len(rv)) + rv + return rv + gv + '00' -# Red component only, white for 0 perplexity (sorry if you're not in dark mode) + def perplexity_color_scale(ppl): - value = hex(max(int(255.0 - params['ppl_scale']*(float(ppl)-1.0)), 0))[2:] + ''' + Red component only, white for 0 perplexity (sorry if you're not in dark mode) + ''' + value = hex(max(int(255.0 - params['ppl_scale'] * (float(ppl) - 1.0)), 0))[2:] if len(value) < 2: - value = '0'*(2 - len(value)) + value + value = '0' * (2 - len(value)) + value + return 'ff' + value + value -# Green-yellow-red for probability and blue component for perplexity + def probability_perplexity_color_scale(prob, ppl): + ''' + Green-yellow-red for probability and blue component for perplexity + ''' + rv = 0 gv = 0 - bv = hex(min(max(int(params['ppl_scale']*(float(ppl)-1.0)), 0), 255))[2:] + bv = hex(min(max(int(params['ppl_scale'] * (float(ppl) - 1.0)), 0), 255))[2:] if len(bv) < 2: - bv = '0'*(2 - len(bv)) + bv + bv = '0' * (2 - len(bv)) + bv + if prob <= 0.5: rv = 'ff' - gv = hex(int(255*prob*2))[2:] + gv = hex(int(255 * prob * 2))[2:] if len(gv) < 2: - gv = '0'*(2 - len(gv)) + gv + gv = '0' * (2 - len(gv)) + gv else: - rv = hex(int(255 - 255*(prob - 0.5)*2))[2:] + rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:] gv = 'ff' if len(rv) < 2: - rv = '0'*(2 - len(rv)) + rv + rv = '0' * (2 - len(rv)) + rv + return rv + gv + bv + def add_color_html(token, color): return f'{token}' -""" -# This is still very broken at the moment, needs CSS too but I'm not very good at CSS (and neither is GPT-4 apparently) so I still need to figure that out. -def add_dropdown_html(token, color, top_tokens, top_probs): - html = f'{token}' - return html -""" + row_class = ' class="selected"' if token_option == token else '' + html += f'{token_option}{prob:.4f}' + if perplexity != 0: + ppl_color = perplexity_color_scale(perplexity) + html += f'Perplexity:{perplexity:.4f}' + html += '' + return html # About 750 characters per token... 
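# Illustrative sketch (not part of the original change): the markup assembled by add_dropdown_html above
# comes out roughly as below. The exact tags are an assumption; the class names are the ones styled by
# the custom_css() rules that follow.
#   <div class="hoverable">
#     <span style="color: #rrggbb">token</span>
#     <div class="dropdown">
#       <table class="dropdown-content">
#         <tr class="selected"><td>top token</td><td>0.1234</td></tr>   <- one row per top token
#         <tr><td>Perplexity:</td><td>1.2345</td></tr>                  <- only when perplexity != 0
#       </table>
#     </div>
#   </div>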
+ + +def custom_css(): + return """ + .dropdown { + display: none; + position: absolute; + z-index: 50; + background-color: var(--block-background-fill); + box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); + width: max-content; + overflow: visible; + padding: 5px; + border-radius: 10px; + border: 1px solid var(--border-color-primary); + } + + .dropdown-content { + border: none; + z-index: 50; + } + + .dropdown-content tr.selected { + background-color: var(--block-label-background-fill); + } + + .dropdown-content td { + color: var(--body-text-color); + } + + .hoverable { + color: var(--body-text-color); + position: relative; + display: inline-block; + overflow: visible; + font-size: 15px; + line-height: 1.75; + margin: 0; + padding: 0; + } + + .hoverable:hover .dropdown { + display: block; + } + + pre { + white-space: pre-wrap; + } + + # TODO: This makes the hover menus extend outside the bounds of the chat area, which is good. + # However, it also makes the scrollbar disappear, which is bad. + # The scroll bar needs to still be present. So for now, we can't see dropdowns that extend past the edge of the chat area. + #.chat { + # overflow-y: auto; + #} + """ + + +# Monkeypatch applied to html_generator.py +# We simply don't render markdown into HTML. We wrap everything in
<pre> tags to preserve whitespace
+# formatting. If you're coloring tokens by perplexity or probability, or especially if you're using
+# the probability dropdown, you probably care more about seeing the tokens the model actually outputted
+# rather than rendering ```code blocks``` or *italics*.
+def convert_to_markdown(string):
+    return '<pre>' + string + '</pre>
' + + +html_generator.convert_to_markdown = convert_to_markdown + def ui(): - color_by_ppl_check = gradio.Checkbox(value=False, label="Color by perplexity", info="Higher perplexity is more red. If also showing probability, higher perplexity has more blue component.") + def update_active_check(x): + params.update({'active': x}) + def update_color_by_ppl_check(x): params.update({'color_by_perplexity': x}) - color_by_ppl_check.change(update_color_by_ppl_check, color_by_ppl_check, None) - color_by_prob_check = gradio.Checkbox(value=False, label="Color by probability", info="Green-yellow-red linear scale, with 100% green, 50% yellow, 0% red.") def update_color_by_prob_check(x): params.update({'color_by_probability': x}) - color_by_prob_check.change(update_color_by_prob_check, color_by_prob_check, None) - # Doesn't work yet... - """ - prob_dropdown_check = gradio.Checkbox(value=False, label="Probability dropdown") def update_prob_dropdown_check(x): params.update({'probability_dropdown': x}) + + active_check = gradio.Checkbox(value=True, label="Compute probabilities and perplexity scores", info="Activate this extension. Note that this extension currently does not work with exllama or llama.cpp.") + color_by_ppl_check = gradio.Checkbox(value=False, label="Color by perplexity", info="Higher perplexity is more red. If also showing probability, higher perplexity has more blue component.") + color_by_prob_check = gradio.Checkbox(value=False, label="Color by probability", info="Green-yellow-red linear scale, with 100% green, 50% yellow, 0% red.") + prob_dropdown_check = gradio.Checkbox(value=False, label="Probability dropdown", info="Hover over a token to show a dropdown of top token probabilities. Currently slightly buggy with whitespace between tokens.") + + active_check.change(update_active_check, active_check, None) + color_by_ppl_check.change(update_color_by_ppl_check, color_by_ppl_check, None) + color_by_prob_check.change(update_color_by_prob_check, color_by_prob_check, None) prob_dropdown_check.change(update_prob_dropdown_check, prob_dropdown_check, None) - """ diff --git a/extensions/sd_api_pictures/script.py b/extensions/sd_api_pictures/script.py index 88a0d940..e33367d4 100644 --- a/extensions/sd_api_pictures/script.py +++ b/extensions/sd_api_pictures/script.py @@ -133,6 +133,9 @@ def get_SD_pictures(description, character): if params['manage_VRAM']: give_VRAM_priority('SD') + description = re.sub('', ' ', description) + description = f"({description}:1)" + payload = { "prompt": params['prompt_prefix'] + description, "seed": params['seed'], diff --git a/extensions/send_pictures/script.py b/extensions/send_pictures/script.py index 39c9362a..f8e6c969 100644 --- a/extensions/send_pictures/script.py +++ b/extensions/send_pictures/script.py @@ -5,7 +5,7 @@ import gradio as gr import torch from transformers import BlipForConditionalGeneration, BlipProcessor -from modules import chat, shared +from modules import chat, shared, ui_chat from modules.ui import gather_interface_values from modules.utils import gradio @@ -54,5 +54,5 @@ def ui(): "value": generate_chat_picture(picture, name1, name2) }), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None).then( gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( + chat.generate_chat_reply_wrapper, gradio(ui_chat.inputs), gradio('display', 'history'), show_progress=False).then( lambda: 
None, None, picture_select, show_progress=False) diff --git a/extensions/silero_tts/harvard_sentences.txt b/extensions/silero_tts/harvard_sentences.txt new file mode 100644 index 00000000..958d7f3c --- /dev/null +++ b/extensions/silero_tts/harvard_sentences.txt @@ -0,0 +1,720 @@ +The birch canoe slid on the smooth planks. +Glue the sheet to the dark blue background. +It's easy to tell the depth of a well. +These days a chicken leg is a rare dish. +Rice is often served in round bowls. +The juice of lemons makes fine punch. +The box was thrown beside the parked truck. +The hogs were fed chopped corn and garbage. +Four hours of steady work faced us. +A large size in stockings is hard to sell. +The boy was there when the sun rose. +A rod is used to catch pink salmon. +The source of the huge river is the clear spring. +Kick the ball straight and follow through. +Help the woman get back to her feet. +A pot of tea helps to pass the evening. +Smoky fires lack flame and heat. +The soft cushion broke the man's fall. +The salt breeze came across from the sea. +The girl at the booth sold fifty bonds. +The small pup gnawed a hole in the sock. +The fish twisted and turned on the bent hook. +Press the pants and sew a button on the vest. +The swan dive was far short of perfect. +The beauty of the view stunned the young boy. +Two blue fish swam in the tank. +Her purse was full of useless trash. +The colt reared and threw the tall rider. +It snowed, rained, and hailed the same morning. +Read verse out loud for pleasure. +Hoist the load to your left shoulder. +Take the winding path to reach the lake. +Note closely the size of the gas tank. +Wipe the grease off his dirty face. +Mend the coat before you go out. +The wrist was badly strained and hung limp. +The stray cat gave birth to kittens. +The young girl gave no clear response. +The meal was cooked before the bell rang. +What joy there is in living. +A king ruled the state in the early days. +The ship was torn apart on the sharp reef. +Sickness kept him home the third week. +The wide road shimmered in the hot sun. +The lazy cow lay in the cool grass. +Lift the square stone over the fence. +The rope will bind the seven books at once. +Hop over the fence and plunge in. +The friendly gang left the drug store. +Mesh wire keeps chicks inside. +The frosty air passed through the coat. +The crooked maze failed to fool the mouse. +Adding fast leads to wrong sums. +The show was a flop from the very start. +A saw is a tool used for making boards. +The wagon moved on well oiled wheels. +March the soldiers past the next hill. +A cup of sugar makes sweet fudge. +Place a rosebush near the porch steps. +Both lost their lives in the raging storm. +We talked of the side show in the circus. +Use a pencil to write the first draft. +He ran half way to the hardware store. +The clock struck to mark the third period. +A small creek cut across the field. +Cars and busses stalled in snow drifts. +The set of china hit the floor with a crash. +This is a grand season for hikes on the road. +The dune rose from the edge of the water. +Those words were the cue for the actor to leave. +A yacht slid around the point into the bay. +The two met while playing on the sand. +The ink stain dried on the finished page. +The walled town was seized without a fight. +The lease ran out in sixteen weeks. +A tame squirrel makes a nice pet. +The horn of the car woke the sleeping cop. +The heart beat strongly and with firm strokes. +The pearl was worn in a thin silver ring. 
+The fruit peel was cut in thick slices. +The Navy attacked the big task force. +See the cat glaring at the scared mouse. +There are more than two factors here. +The hat brim was wide and too droopy. +The lawyer tried to lose his case. +The grass curled around the fence post. +Cut the pie into large parts. +Men strive but seldom get rich. +Always close the barn door tight. +He lay prone and hardly moved a limb. +The slush lay deep along the street. +A wisp of cloud hung in the blue air. +A pound of sugar costs more than eggs. +The fin was sharp and cut the clear water. +The play seems dull and quite stupid. +Bail the boat to stop it from sinking. +The term ended in late June that year. +A tusk is used to make costly gifts. +Ten pins were set in order. +The bill was paid every third week. +Oak is strong and also gives shade. +Cats and dogs each hate the other. +The pipe began to rust while new. +Open the crate but don't break the glass. +Add the sum to the product of these three. +Thieves who rob friends deserve jail. +The ripe taste of cheese improves with age. +Act on these orders with great speed. +The hog crawled under the high fence. +Move the vat over the hot fire. +The bark of the pine tree was shiny and dark. +Leaves turn brown and yellow in the fall. +The pennant waved when the wind blew. +Split the log with a quick, sharp blow. +Burn peat after the logs give out. +He ordered peach pie with ice cream. +Weave the carpet on the right hand side. +Hemp is a weed found in parts of the tropics. +A lame back kept his score low. +We find joy in the simplest things. +Type out three lists of orders. +The harder he tried the less he got done. +The boss ran the show with a watchful eye. +The cup cracked and spilled its contents. +Paste can cleanse the most dirty brass. +The slang word for raw whiskey is booze. +It caught its hind paw in a rusty trap. +The wharf could be seen at the farther shore. +Feel the heat of the weak dying flame. +The tiny girl took off her hat. +A cramp is no small danger on a swim. +He said the same phrase thirty times. +Pluck the bright rose without leaves. +Two plus seven is less than ten. +The glow deepened in the eyes of the sweet girl. +Bring your problems to the wise chief. +Write a fond note to the friend you cherish. +Clothes and lodging are free to new men. +We frown when events take a bad turn. +Port is a strong wine with a smoky taste. +The young kid jumped the rusty gate. +Guess the results from the first scores. +A salt pickle tastes fine with ham. +The just claim got the right verdict. +These thistles bend in a high wind. +Pure bred poodles have curls. +The tree top waved in a graceful way. +The spot on the blotter was made by green ink. +Mud was spattered on the front of his white shirt. +The cigar burned a hole in the desk top. +The empty flask stood on the tin tray. +A speedy man can beat this track mark. +He broke a new shoelace that day. +The coffee stand is too high for the couch. +The urge to write short stories is rare. +The pencils have all been used. +The pirates seized the crew of the lost ship. +We tried to replace the coin but failed. +She sewed the torn coat quite neatly. +The sofa cushion is red and of light weight. +The jacket hung on the back of the wide chair. +At that high level the air is pure. +Drop the two when you add the figures. +A filing case is now hard to buy. +An abrupt start does not win the prize. +Wood is best for making toys and blocks. +The office paint was a dull, sad tan. +He knew the skill of the great young actress. 
+A rag will soak up spilled water. +A shower of dirt fell from the hot pipes. +Steam hissed from the broken valve. +The child almost hurt the small dog. +There was a sound of dry leaves outside. +The sky that morning was clear and bright blue. +Torn scraps littered the stone floor. +Sunday is the best part of the week. +The doctor cured him with these pills. +The new girl was fired today at noon. +They felt gay when the ship arrived in port. +Add the store's account to the last cent. +Acid burns holes in wool cloth. +Fairy tales should be fun to write. +Eight miles of woodland burned to waste. +The third act was dull and tired the players. +A young child should not suffer fright. +Add the column and put the sum here. +We admire and love a good cook. +There the flood mark is ten inches. +He carved a head from the round block of marble. +She has a smart way of wearing clothes. +The fruit of a fig tree is apple-shaped. +Corn cobs can be used to kindle a fire. +Where were they when the noise started. +The paper box is full of thumb tacks. +Sell your gift to a buyer at a good gain. +The tongs lay beside the ice pail. +The petals fall with the next puff of wind. +Bring your best compass to the third class. +They could laugh although they were sad. +Farmers came in to thresh the oat crop. +The brown house was on fire to the attic. +The lure is used to catch trout and flounder. +Float the soap on top of the bath water. +A blue crane is a tall wading bird. +A fresh start will work such wonders. +The club rented the rink for the fifth night. +After the dance, they went straight home. +The hostess taught the new maid to serve. +He wrote his last novel there at the inn. +Even the worst will beat his low score. +The cement had dried when he moved it. +The loss of the second ship was hard to take. +The fly made its way along the wall. +Do that with a wooden stick. +Live wires should be kept covered. +The large house had hot water taps. +It is hard to erase blue or red ink. +Write at once or you may forget it. +The doorknob was made of bright clean brass. +The wreck occurred by the bank on Main Street. +A pencil with black lead writes best. +Coax a young calf to drink from a bucket. +Schools for ladies teach charm and grace. +The lamp shone with a steady green flame. +They took the axe and the saw to the forest. +The ancient coin was quite dull and worn. +The shaky barn fell with a loud crash. +Jazz and swing fans like fast music. +Rake the rubbish up and then burn it. +Slash the gold cloth into fine ribbons. +Try to have the court decide the case. +They are pushed back each time they attack. +He broke his ties with groups of former friends. +They floated on the raft to sun their white backs. +The map had an X that meant nothing. +Whitings are small fish caught in nets. +Some ads serve to cheat buyers. +Jerk the rope and the bell rings weakly. +A waxed floor makes us lose balance. +Madam, this is the best brand of corn. +On the islands the sea breeze is soft and mild. +The play began as soon as we sat down. +This will lead the world to more sound and fury. +Add salt before you fry the egg. +The rush for funds reached its peak Tuesday. +The birch looked stark white and lonesome. +The box is held by a bright red snapper. +To make pure ice, you freeze water. +The first worm gets snapped early. +Jump the fence and hurry up the bank. +Yell and clap as the curtain slides back. +They are men who walk the middle of the road. +Both brothers wear the same size. +In some form or other we need fun. 
+The prince ordered his head chopped off. +The houses are built of red clay bricks. +Ducks fly north but lack a compass. +Fruit flavors are used in fizz drinks. +These pills do less good than others. +Canned pears lack full flavor. +The dark pot hung in the front closet. +Carry the pail to the wall and spill it there. +The train brought our hero to the big town. +We are sure that one war is enough. +Gray paint stretched for miles around. +The rude laugh filled the empty room. +High seats are best for football fans. +Tea served from the brown jug is tasty. +A dash of pepper spoils beef stew. +A zestful food is the hot-cross bun. +The horse trotted around the field at a brisk pace. +Find the twin who stole the pearl necklace. +Cut the cord that binds the box tightly. +The red tape bound the smuggled food. +Look in the corner to find the tan shirt. +The cold drizzle will halt the bond drive. +Nine men were hired to dig the ruins. +The junk yard had a mouldy smell. +The flint sputtered and lit a pine torch. +Soak the cloth and drown the sharp odor. +The shelves were bare of both jam or crackers. +A joy to every child is the swan boat. +All sat frozen and watched the screen. +A cloud of dust stung his tender eyes. +To reach the end he needs much courage. +Shape the clay gently into block form. +A ridge on a smooth surface is a bump or flaw. +Hedge apples may stain your hands green. +Quench your thirst, then eat the crackers. +Tight curls get limp on rainy days. +The mute muffled the high tones of the horn. +The gold ring fits only a pierced ear. +The old pan was covered with hard fudge. +Watch the log float in the wide river. +The node on the stalk of wheat grew daily. +The heap of fallen leaves was set on fire. +Write fast if you want to finish early. +His shirt was clean but one button was gone. +The barrel of beer was a brew of malt and hops. +Tin cans are absent from store shelves. +Slide the box into that empty space. +The plant grew large and green in the window. +The beam dropped down on the workmen's head. +Pink clouds floated with the breeze. +She danced like a swan, tall and graceful. +The tube was blown and the tire flat and useless. +It is late morning on the old wall clock. +Let's all join as we sing the last chorus. +The last switch cannot be turned off. +The fight will end in just six minutes. +The store walls were lined with colored frocks. +The peace league met to discuss their plans. +The rise to fame of a person takes luck. +Paper is scarce, so write with much care. +The quick fox jumped on the sleeping cat. +The nozzle of the fire hose was bright brass. +Screw the round cap on as tight as needed. +Time brings us many changes. +The purple tie was ten years old. +Men think and plan and sometimes act. +Fill the ink jar with sticky glue. +He smoke a big pipe with strong contents. +We need grain to keep our mules healthy. +Pack the records in a neat thin case. +The crunch of feet in the snow was the only sound. +The copper bowl shone in the sun's rays. +Boards will warp unless kept dry. +The plush chair leaned against the wall. +Glass will clink when struck by metal. +Bathe and relax in the cool green grass. +Nine rows of soldiers stood in line. +The beach is dry and shallow at low tide. +The idea is to sew both edges straight. +The kitten chased the dog down the street. +Pages bound in cloth make a book. +Try to trace the fine lines of the painting. +Women form less than half of the group. +The zones merge in the central part of town. +A gem in the rough needs work to polish. 
+Code is used when secrets are sent. +Most of the news is easy for us to hear. +He used the lathe to make brass objects. +The vane on top of the pole revolved in the wind. +Mince pie is a dish served to children. +The clan gathered on each dull night. +Let it burn, it gives us warmth and comfort. +A castle built from sand fails to endure. +A child's wit saved the day for us. +Tack the strip of carpet to the worn floor. +Next Tuesday we must vote. +Pour the stew from the pot into the plate. +Each penny shone like new. +The man went to the woods to gather sticks. +The dirt piles were lines along the road. +The logs fell and tumbled into the clear stream. +Just hoist it up and take it away. +A ripe plum is fit for a king's palate. +Our plans right now are hazy. +Brass rings are sold by these natives. +It takes a good trap to capture a bear. +Feed the white mouse some flower seeds. +The thaw came early and freed the stream. +He took the lead and kept it the whole distance. +The key you designed will fit the lock. +Plead to the council to free the poor thief. +Better hash is made of rare beef. +This plank was made for walking on. +The lake sparkled in the red hot sun. +He crawled with care along the ledge. +Tend the sheep while the dog wanders. +It takes a lot of help to finish these. +Mark the spot with a sign painted red. +Take two shares as a fair profit. +The fur of cats goes by many names. +North winds bring colds and fevers. +He asks no person to vouch for him. +Go now and come here later. +A sash of gold silk will trim her dress. +Soap can wash most dirt away. +That move means the game is over. +He wrote down a long list of items. +A siege will crack the strong defense. +Grape juice and water mix well. +Roads are paved with sticky tar. +Fake stones shine but cost little. +The drip of the rain made a pleasant sound. +Smoke poured out of every crack. +Serve the hot rum to the tired heroes. +Much of the story makes good sense. +The sun came up to light the eastern sky. +Heave the line over the port side. +A lathe cuts and trims any wood. +It's a dense crowd in two distinct ways. +His hip struck the knee of the next player. +The stale smell of old beer lingers. +The desk was firm on the shaky floor. +It takes heat to bring out the odor. +Beef is scarcer than some lamb. +Raise the sail and steer the ship northward. +A cone costs five cents on Mondays. +A pod is what peas always grow in. +Jerk the dart from the cork target. +No cement will hold hard wood. +We now have a new base for shipping. +A list of names is carved around the base. +The sheep were led home by a dog. +Three for a dime, the young peddler cried. +The sense of smell is better than that of touch. +No hardship seemed to keep him sad. +Grace makes up for lack of beauty. +Nudge gently but wake her now. +The news struck doubt into restless minds. +Once we stood beside the shore. +A chink in the wall allowed a draft to blow. +Fasten two pins on each side. +A cold dip restores health and zest. +He takes the oath of office each March. +The sand drifts over the sill of the old house. +The point of the steel pen was bent and twisted. +There is a lag between thought and act. +Seed is needed to plant the spring corn. +Draw the chart with heavy black lines. +The boy owed his pal thirty cents. +The chap slipped into the crowd and was lost. +Hats are worn to tea and not to dinner. +The ramp led up to the wide highway. +Beat the dust from the rug onto the lawn. +Say it slowly but make it ring clear. +The straw nest housed five robins. 
+Screen the porch with woven straw mats. +This horse will nose his way to the finish. +The dry wax protects the deep scratch. +He picked up the dice for a second roll. +These coins will be needed to pay his debt. +The nag pulled the frail cart along. +Twist the valve and release hot steam. +The vamp of the shoe had a gold buckle. +The smell of burned rags itches my nose. +New pants lack cuffs and pockets. +The marsh will freeze when cold enough. +They slice the sausage thin with a knife. +The bloom of the rose lasts a few days. +A gray mare walked before the colt. +Breakfast buns are fine with a hot drink. +Bottles hold four kinds of rum. +The man wore a feather in his felt hat. +He wheeled the bike past the winding road. +Drop the ashes on the worn old rug. +The desk and both chairs were painted tan. +Throw out the used paper cup and plate. +A clean neck means a neat collar. +The couch cover and hall drapes were blue. +The stems of the tall glasses cracked and broke. +The wall phone rang loud and often. +The clothes dried on a thin wooden rack. +Turn on the lantern which gives us light. +The cleat sank deeply into the soft turf. +The bills were mailed promptly on the tenth of the month. +To have is better than to wait and hope. +The price is fair for a good antique clock. +The music played on while they talked. +Dispense with a vest on a day like this. +The bunch of grapes was pressed into wine. +He sent the figs, but kept the ripe cherries. +The hinge on the door creaked with old age. +The screen before the fire kept in the sparks. +Fly by night, and you waste little time. +Thick glasses helped him read the print. +Birth and death mark the limits of life. +The chair looked strong but had no bottom. +The kite flew wildly in the high wind. +A fur muff is stylish once more. +The tin box held priceless stones. +We need an end of all such matter. +The case was puzzling to the old and wise. +The bright lanterns were gay on the dark lawn. +We don't get much money but we have fun. +The youth drove with zest, but little skill. +Five years he lived with a shaggy dog. +A fence cuts through the corner lot. +The way to save money is not to spend much. +Shut the hatch before the waves push it in. +The odor of spring makes young hearts jump. +Crack the walnut with your sharp side teeth. +He offered proof in the form of a large chart. +Send the stuff in a thick paper bag. +A quart of milk is water for the most part. +They told wild tales to frighten him. +The three story house was built of stone. +In the rear of the ground floor was a large passage. +A man in a blue sweater sat at the desk. +Oats are a food eaten by horse and man. +Their eyelids droop for want of sleep. +A sip of tea revives his tired friend. +There are many ways to do these things. +Tuck the sheet under the edge of the mat. +A force equal to that would move the earth. +We like to see clear weather. +The work of the tailor is seen on each side. +Take a chance and win a china doll. +Shake the dust from your shoes, stranger. +She was kind to sick old people. +The square wooden crate was packed to be shipped. +The dusty bench stood by the stone wall. +We dress to suit the weather of most days. +Smile when you say nasty words. +A bowl of rice is free with chicken stew. +The water in this well is a source of good health. +Take shelter in this tent, but keep still. +That guy is the writer of a few banned books. +The little tales they tell are false. +The door was barred, locked, and bolted as well. +Ripe pears are fit for a queen's table. 
+A big wet stain was on the round carpet. +The kite dipped and swayed, but stayed aloft. +The pleasant hours fly by much too soon. +The room was crowded with a wild mob. +This strong arm shall shield your honor. +She blushed when he gave her a white orchid. +The beetle droned in the hot June sun. +Press the pedal with your left foot. +Neat plans fail without luck. +The black trunk fell from the landing. +The bank pressed for payment of the debt. +The theft of the pearl pin was kept secret. +Shake hands with this friendly child. +The vast space stretched into the far distance. +A rich farm is rare in this sandy waste. +His wide grin earned many friends. +Flax makes a fine brand of paper. +Hurdle the pit with the aid of a long pole. +A strong bid may scare your partner stiff. +Even a just cause needs power to win. +Peep under the tent and see the clowns. +The leaf drifts along with a slow spin. +Cheap clothes are flashy but don't last. +A thing of small note can cause despair. +Flood the mails with requests for this book. +A thick coat of black paint covered all. +The pencil was cut to be sharp at both ends. +Those last words were a strong statement. +He wrote his name boldly at the top of the sheet. +Dill pickles are sour but taste fine. +Down that road is the way to the grain farmer. +Either mud or dust are found at all times. +The best method is to fix it in place with clips. +If you mumble your speech will be lost. +At night the alarm roused him from a deep sleep. +Read just what the meter says. +Fill your pack with bright trinkets for the poor. +The small red neon lamp went out. +Clams are small, round, soft, and tasty. +The fan whirled its round blades softly. +The line where the edges join was clean. +Breathe deep and smell the piny air. +It matters not if he reads these words or those. +A brown leather bag hung from its strap. +A toad and a frog are hard to tell apart. +A white silk jacket goes with any shoes. +A break in the dam almost caused a flood. +Paint the sockets in the wall dull green. +The child crawled into the dense grass. +Bribes fail where honest men work. +Trample the spark, else the flames will spread. +The hilt of the sword was carved with fine designs. +A round hole was drilled through the thin board. +Footprints showed the path he took up the beach. +She was waiting at my front lawn. +A vent near the edge brought in fresh air. +Prod the old mule with a crooked stick. +It is a band of steel three inches wide. +The pipe ran almost the length of the ditch. +It was hidden from sight by a mass of leaves and shrubs. +The weight of the package was seen on the high scale. +Wake and rise, and step into the green outdoors. +The green light in the brown box flickered. +The brass tube circled the high wall. +The lobes of her ears were pierced to hold rings. +Hold the hammer near the end to drive the nail. +Next Sunday is the twelfth of the month. +Every word and phrase he speaks is true. +He put his last cartridge into the gun and fired. +They took their kids from the public school. +Drive the screw straight into the wood. +Keep the hatch tight and the watch constant. +Sever the twine with a quick snip of the knife. +Paper will dry out when wet. +Slide the catch back and open the desk. +Help the weak to preserve their strength. +A sullen smile gets few friends. +Stop whistling and watch the boys march. +Jerk the cord, and out tumbles the gold. +Slide the tray across the glass top. +The cloud moved in a stately way and was gone. +Light maple makes for a swell room. 
+Set the piece here and say nothing. +Dull stories make her laugh. +A stiff cord will do to fasten your shoe. +Get the trust fund to the bank early. +Choose between the high road and the low. +A plea for funds seems to come again. +He lent his coat to the tall gaunt stranger. +There is a strong chance it will happen once more. +The duke left the park in a silver coach. +Greet the new guests and leave quickly. +When the frost has come it is time for turkey. +Sweet words work better than fierce. +A thin stripe runs down the middle. +A six comes up more often than a ten. +Lush fern grow on the lofty rocks. +The ram scared the school children off. +The team with the best timing looks good. +The farmer swapped his horse for a brown ox. +Sit on the perch and tell the others what to do. +A steep trail is painful for our feet. +The early phase of life moves fast. +Green moss grows on the northern side. +Tea in thin china has a sweet taste. +Pitch the straw through the door of the stable. +The latch on the back gate needed a nail. +The goose was brought straight from the old market. +The sink is the thing in which we pile dishes. +A whiff of it will cure the most stubborn cold. +The facts don't always show who is right. +She flaps her cape as she parades the street. +The loss of the cruiser was a blow to the fleet. +Loop the braid to the left and then over. +Plead with the lawyer to drop the lost cause. +Calves thrive on tender spring grass. +Post no bills on this office wall. +Tear a thin sheet from the yellow pad. +A cruise in warm waters in a sleek yacht is fun. +A streak of color ran down the left edge. +It was done before the boy could see it. +Crouch before you jump or miss the mark. +Pack the kits and don't forget the salt. +The square peg will settle in the round hole. +Fine soap saves tender skin. +Poached eggs and tea must suffice. +Bad nerves are jangled by a door slam. +Ship maps are different from those for planes. +Dimes showered down from all sides. +They sang the same tunes at each party. +The sky in the west is tinged with orange red. +The pods of peas ferment in bare fields. +The horse balked and threw the tall rider. +The hitch between the horse and cart broke. +Pile the coal high in the shed corner. +A gold vase is both rare and costly. +The knife was hung inside its bright sheath. +The rarest spice comes from the far East. +The roof should be tilted at a sharp slant. +A smatter of French is worse than none. +The mule trod the treadmill day and night. +The aim of the contest is to raise a great fund. +To send it now in large amounts is bad. +There is a fine hard tang in salty air. +Cod is the main business of the north shore. +The slab was hewn from heavy blocks of slate. +Dunk the stale biscuits into strong drink. +Hang tinsel from both branches. +Cap the jar with a tight brass cover. +The poor boy missed the boat again. +Be sure to set the lamp firmly in the hole. +Pick a card and slip it under the pack. +A round mat will cover the dull spot. +The first part of the plan needs changing. +A good book informs of what we ought to know. +The mail comes in three batches per day. +You cannot brew tea in a cold pot. +Dots of light betrayed the black cat. +Put the chart on the mantel and tack it down. +The night shift men rate extra pay. +The red paper brightened the dim stage. +See the player scoot to third base. +Slide the bill between the two leaves. +Many hands help get the job done. +We don't like to admit our small faults. +No doubt about the way the wind blows. 
+Dig deep in the earth for pirate's gold. +The steady drip is worse than a drenching rain. +A flat pack takes less luggage space. +Green ice frosted the punch bowl. +A stuffed chair slipped from the moving van. +The stitch will serve but needs to be shortened. +A thin book fits in the side pocket. +The gloss on top made it unfit to read. +The hail pattered on the burnt brown grass. +Seven seals were stamped on great sheets. +Our troops are set to strike heavy blows. +The store was jammed before the sale could start. +It was a bad error on the part of the new judge. +One step more and the board will collapse. +Take the match and strike it against your shoe. +The pot boiled, but the contents failed to jell. +The baby puts his right foot in his mouth. +The bombs left most of the town in ruins. +Stop and stare at the hard working man. +The streets are narrow and full of sharp turns. +The pup jerked the leash as he saw a feline shape. +Open your book to the first page. +Fish evade the net and swim off. +Dip the pail once and let it settle. +Will you please answer that phone. +The big red apple fell to the ground. +The curtain rose and the show was on. +The young prince became heir to the throne. +He sent the boy on a short errand. +Leave now and you will arrive on time. +The corner store was robbed last night. +A gold ring will please most any girl. +The long journey home took a year. +She saw a cat in the neighbor's house. +A pink shell was found on the sandy beach. +Small children came to see him. +The grass and bushes were wet with dew. +The blind man counted his old coins. +A severe storm tore down the barn. +She called his name many times. +When you hear the bell, come quickly. \ No newline at end of file diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 3ecd5bd9..31677eca 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -1,3 +1,5 @@ +import html +import random import time from pathlib import Path @@ -5,7 +7,7 @@ import gradio as gr import torch from extensions.silero_tts import tts_preprocessor -from modules import chat, shared +from modules import chat, shared, ui_chat from modules.utils import gradio torch._C._jit_set_profiling_mode(False) @@ -106,6 +108,7 @@ def history_modifier(history): def output_modifier(string, state): global model, current_params, streaming_state + for i in params: if params[i] != current_params[i]: model = load_model() @@ -116,7 +119,7 @@ def output_modifier(string, state): return string original_string = string - string = tts_preprocessor.preprocess(string) + string = tts_preprocessor.preprocess(html.unescape(string)) if string == '': string = '*Empty reply, try regenerating*' @@ -140,6 +143,35 @@ def setup(): model = load_model() +def random_sentence(): + with open(Path("extensions/silero_tts/harvard_sentences.txt")) as f: + return random.choice(list(f)) + + +def voice_preview(preview_text): + global model, current_params, streaming_state + + for i in params: + if params[i] != current_params[i]: + model = load_model() + current_params = params.copy() + break + + string = tts_preprocessor.preprocess(preview_text or random_sentence()) + + output_file = Path('extensions/silero_tts/outputs/voice_preview.wav') + prosody = f"" + silero_input = f'{prosody}{xmlesc(string)}' + model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) + + return f'' + + +def custom_css(): + path_to_css = Path(__file__).parent.resolve() / 
'style.css' + return open(path_to_css, 'r').read() + + def ui(): # Gradio elements with gr.Accordion("Silero TTS"): @@ -153,31 +185,33 @@ def ui(): v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') + with gr.Row(): + preview_text = gr.Text(show_label=False, placeholder="Preview text", elem_id="silero_preview_text") + preview_play = gr.Button("Preview") + preview_audio = gr.HTML(visible=False) + with gr.Row(): convert = gr.Button('Permanently replace audios with the message texts') convert_cancel = gr.Button('Cancel', visible=False) convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False) - gr.Markdown('[Click here for Silero audio samples](https://oobabooga.github.io/silero-samples/index.html)') + # Convert history with confirmation + convert_arr = [convert_confirm, convert, convert_cancel] + convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) + convert_confirm.click( + lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( + remove_tts_from_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) - if shared.is_chat(): - # Convert history with confirmation - convert_arr = [convert_confirm, convert, convert_cancel] - convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) - convert_confirm.click( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( - remove_tts_from_history, gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) - convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) - - # Toggle message text in history - show_text.change( - lambda x: params.update({"show_text": x}), show_text, None).then( - toggle_text_in_history, gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + # Toggle message text in history + show_text.change( + lambda x: params.update({"show_text": x}), show_text, None).then( + toggle_text_in_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) @@ -185,3 +219,7 @@ def ui(): voice.change(lambda x: params.update({"speaker": x}), voice, None) v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None) v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None) + + # Play preview + preview_text.submit(voice_preview, preview_text, preview_audio) + preview_play.click(voice_preview, preview_text, preview_audio) diff --git 
a/extensions/silero_tts/style.css b/extensions/silero_tts/style.css new file mode 100644 index 00000000..2ab7aefb --- /dev/null +++ b/extensions/silero_tts/style.css @@ -0,0 +1,8 @@ +.SDAP .hires_opts input[type="number"] { + width: 6em !important; +} + +/* silero_tts preview */ +.form:has(> #silero_preview_text) { + min-width: 75% +} diff --git a/extensions/superbooga/script.py b/extensions/superbooga/script.py index 5ef14d9d..06fe8ad3 100644 --- a/extensions/superbooga/script.py +++ b/extensions/superbooga/script.py @@ -4,7 +4,7 @@ import textwrap import gradio as gr from bs4 import BeautifulSoup -from modules import chat, shared +from modules import chat from modules.logging_colors import logger from .chromadb import add_chunks_to_collector, make_collector @@ -96,7 +96,8 @@ def apply_settings(chunk_count, chunk_count_initial, time_weight): def custom_generate_chat_prompt(user_input, state, **kwargs): global chat_collector - history = state['history'] + # get history as being modified when using regenerate. + history = kwargs['history'] if state['mode'] == 'instruct': results = collector.get_sorted(user_input, n_results=params['chunk_count']) @@ -142,8 +143,8 @@ def remove_special_tokens(string): return re.sub(pattern, '', string) -def input_modifier(string): - if shared.is_chat(): +def input_modifier(string, state, is_chat=False): + if is_chat: return string # Find the user input diff --git a/characters/instruction-following/Airoboros-v1.2.yaml b/instruction-templates/Airoboros-v1.2.yaml similarity index 100% rename from characters/instruction-following/Airoboros-v1.2.yaml rename to instruction-templates/Airoboros-v1.2.yaml diff --git a/characters/instruction-following/Alpaca.yaml b/instruction-templates/Alpaca.yaml similarity index 100% rename from characters/instruction-following/Alpaca.yaml rename to instruction-templates/Alpaca.yaml diff --git a/characters/instruction-following/Bactrian.yaml b/instruction-templates/Bactrian.yaml similarity index 100% rename from characters/instruction-following/Bactrian.yaml rename to instruction-templates/Bactrian.yaml diff --git a/characters/instruction-following/Baichuan Chat.yaml b/instruction-templates/Baichuan Chat.yaml similarity index 100% rename from characters/instruction-following/Baichuan Chat.yaml rename to instruction-templates/Baichuan Chat.yaml diff --git a/characters/instruction-following/Baize.yaml b/instruction-templates/Baize.yaml similarity index 100% rename from characters/instruction-following/Baize.yaml rename to instruction-templates/Baize.yaml diff --git a/characters/instruction-following/Bluemoon.yaml b/instruction-templates/Bluemoon.yaml similarity index 100% rename from characters/instruction-following/Bluemoon.yaml rename to instruction-templates/Bluemoon.yaml diff --git a/characters/instruction-following/ChatGLM.yaml b/instruction-templates/ChatGLM.yaml similarity index 100% rename from characters/instruction-following/ChatGLM.yaml rename to instruction-templates/ChatGLM.yaml diff --git a/characters/instruction-following/Chinese-Vicuna-Chat.yaml b/instruction-templates/Chinese-Vicuna-Chat.yaml similarity index 100% rename from characters/instruction-following/Chinese-Vicuna-Chat.yaml rename to instruction-templates/Chinese-Vicuna-Chat.yaml diff --git a/characters/instruction-following/Galactica Cite.yaml b/instruction-templates/Galactica Cite.yaml similarity index 100% rename from characters/instruction-following/Galactica Cite.yaml rename to instruction-templates/Galactica Cite.yaml diff --git 
a/characters/instruction-following/Galactica Finetuned.yaml b/instruction-templates/Galactica Finetuned.yaml similarity index 100% rename from characters/instruction-following/Galactica Finetuned.yaml rename to instruction-templates/Galactica Finetuned.yaml diff --git a/characters/instruction-following/Galactica Q.yaml b/instruction-templates/Galactica Q.yaml similarity index 100% rename from characters/instruction-following/Galactica Q.yaml rename to instruction-templates/Galactica Q.yaml diff --git a/characters/instruction-following/Galactica Summary.yaml b/instruction-templates/Galactica Summary.yaml similarity index 100% rename from characters/instruction-following/Galactica Summary.yaml rename to instruction-templates/Galactica Summary.yaml diff --git a/characters/instruction-following/Galactica Work.yaml b/instruction-templates/Galactica Work.yaml similarity index 100% rename from characters/instruction-following/Galactica Work.yaml rename to instruction-templates/Galactica Work.yaml diff --git a/characters/instruction-following/Galactica v2.yaml b/instruction-templates/Galactica v2.yaml similarity index 100% rename from characters/instruction-following/Galactica v2.yaml rename to instruction-templates/Galactica v2.yaml diff --git a/characters/instruction-following/Galactica.yaml b/instruction-templates/Galactica.yaml similarity index 100% rename from characters/instruction-following/Galactica.yaml rename to instruction-templates/Galactica.yaml diff --git a/characters/instruction-following/Gorilla.yaml b/instruction-templates/Gorilla.yaml similarity index 100% rename from characters/instruction-following/Gorilla.yaml rename to instruction-templates/Gorilla.yaml diff --git a/characters/instruction-following/Guanaco non-chat.yaml b/instruction-templates/Guanaco non-chat.yaml similarity index 100% rename from characters/instruction-following/Guanaco non-chat.yaml rename to instruction-templates/Guanaco non-chat.yaml diff --git a/characters/instruction-following/Guanaco-QLoRA.yaml b/instruction-templates/Guanaco-QLoRA.yaml similarity index 100% rename from characters/instruction-following/Guanaco-QLoRA.yaml rename to instruction-templates/Guanaco-QLoRA.yaml diff --git a/characters/instruction-following/Guanaco.yaml b/instruction-templates/Guanaco.yaml similarity index 100% rename from characters/instruction-following/Guanaco.yaml rename to instruction-templates/Guanaco.yaml diff --git a/characters/instruction-following/H2O-human_bot.yaml b/instruction-templates/H2O-human_bot.yaml similarity index 100% rename from characters/instruction-following/H2O-human_bot.yaml rename to instruction-templates/H2O-human_bot.yaml diff --git a/characters/instruction-following/H2O-prompt_answer.yaml b/instruction-templates/H2O-prompt_answer.yaml similarity index 100% rename from characters/instruction-following/H2O-prompt_answer.yaml rename to instruction-templates/H2O-prompt_answer.yaml diff --git a/characters/instruction-following/Hippogriff.yaml b/instruction-templates/Hippogriff.yaml similarity index 100% rename from characters/instruction-following/Hippogriff.yaml rename to instruction-templates/Hippogriff.yaml diff --git a/characters/instruction-following/INCITE-Chat.yaml b/instruction-templates/INCITE-Chat.yaml similarity index 100% rename from characters/instruction-following/INCITE-Chat.yaml rename to instruction-templates/INCITE-Chat.yaml diff --git a/characters/instruction-following/INCITE-Instruct.yaml b/instruction-templates/INCITE-Instruct.yaml similarity index 100% rename from 
characters/instruction-following/INCITE-Instruct.yaml rename to instruction-templates/INCITE-Instruct.yaml diff --git a/characters/instruction-following/KoAlpaca.yaml b/instruction-templates/KoAlpaca.yaml similarity index 100% rename from characters/instruction-following/KoAlpaca.yaml rename to instruction-templates/KoAlpaca.yaml diff --git a/characters/instruction-following/Koala.yaml b/instruction-templates/Koala.yaml similarity index 100% rename from characters/instruction-following/Koala.yaml rename to instruction-templates/Koala.yaml diff --git a/characters/instruction-following/LLaVA.yaml b/instruction-templates/LLaVA.yaml similarity index 100% rename from characters/instruction-following/LLaVA.yaml rename to instruction-templates/LLaVA.yaml diff --git a/characters/instruction-following/Llama-v2.yaml b/instruction-templates/Llama-v2.yaml similarity index 100% rename from characters/instruction-following/Llama-v2.yaml rename to instruction-templates/Llama-v2.yaml diff --git a/characters/instruction-following/MOSS.yaml b/instruction-templates/MOSS.yaml similarity index 100% rename from characters/instruction-following/MOSS.yaml rename to instruction-templates/MOSS.yaml diff --git a/characters/instruction-following/MPT-Chat.yaml b/instruction-templates/MPT-Chat.yaml similarity index 100% rename from characters/instruction-following/MPT-Chat.yaml rename to instruction-templates/MPT-Chat.yaml diff --git a/characters/instruction-following/Manticore Chat.yaml b/instruction-templates/Manticore Chat.yaml similarity index 100% rename from characters/instruction-following/Manticore Chat.yaml rename to instruction-templates/Manticore Chat.yaml diff --git a/characters/instruction-following/Metharme.yaml b/instruction-templates/Metharme.yaml similarity index 100% rename from characters/instruction-following/Metharme.yaml rename to instruction-templates/Metharme.yaml diff --git a/characters/instruction-following/Minotaur.yaml b/instruction-templates/Minotaur.yaml similarity index 100% rename from characters/instruction-following/Minotaur.yaml rename to instruction-templates/Minotaur.yaml diff --git a/characters/instruction-following/NewHope.yaml b/instruction-templates/NewHope.yaml similarity index 100% rename from characters/instruction-following/NewHope.yaml rename to instruction-templates/NewHope.yaml diff --git a/characters/instruction-following/Open Assistant.yaml b/instruction-templates/Open Assistant.yaml similarity index 100% rename from characters/instruction-following/Open Assistant.yaml rename to instruction-templates/Open Assistant.yaml diff --git a/characters/instruction-following/OpenBuddy.yaml b/instruction-templates/OpenBuddy.yaml similarity index 100% rename from characters/instruction-following/OpenBuddy.yaml rename to instruction-templates/OpenBuddy.yaml diff --git a/instruction-templates/OpenChat.yaml b/instruction-templates/OpenChat.yaml new file mode 100644 index 00000000..3b84c226 --- /dev/null +++ b/instruction-templates/OpenChat.yaml @@ -0,0 +1,4 @@ +user: "GPT4 User:" +bot: "GPT4 Assistant:" +turn_template: "<|user|> <|user-message|><|end_of_turn|><|bot|> <|bot-message|><|end_of_turn|>" +context: "" diff --git a/instruction-templates/OpenOrca-Platypus2.yaml b/instruction-templates/OpenOrca-Platypus2.yaml new file mode 100644 index 00000000..6cac0046 --- /dev/null +++ b/instruction-templates/OpenOrca-Platypus2.yaml @@ -0,0 +1,4 @@ +user: "### Instruction:" +bot: "### Response:" +turn_template: "<|user|> <|user-message|>\n\n<|bot|> <|bot-message|>\n\n" +context: "" diff 
--git a/characters/instruction-following/Orca Mini.yaml b/instruction-templates/Orca Mini.yaml similarity index 100% rename from characters/instruction-following/Orca Mini.yaml rename to instruction-templates/Orca Mini.yaml diff --git a/characters/instruction-following/RWKV-Raven.yaml b/instruction-templates/RWKV-Raven.yaml similarity index 100% rename from characters/instruction-following/RWKV-Raven.yaml rename to instruction-templates/RWKV-Raven.yaml diff --git a/characters/instruction-following/Samantha.yaml b/instruction-templates/Samantha.yaml similarity index 100% rename from characters/instruction-following/Samantha.yaml rename to instruction-templates/Samantha.yaml diff --git a/instruction-templates/StableBeluga2.yaml b/instruction-templates/StableBeluga2.yaml new file mode 100644 index 00000000..cd5675f8 --- /dev/null +++ b/instruction-templates/StableBeluga2.yaml @@ -0,0 +1,4 @@ +user: "### User:" +bot: "### Assistant:" +turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n" +context: "### System:\nThis is a system prompt, please behave and help the user.\n\n" diff --git a/characters/instruction-following/StableLM.yaml b/instruction-templates/StableLM.yaml similarity index 100% rename from characters/instruction-following/StableLM.yaml rename to instruction-templates/StableLM.yaml diff --git a/characters/instruction-following/StableVicuna.yaml b/instruction-templates/StableVicuna.yaml similarity index 100% rename from characters/instruction-following/StableVicuna.yaml rename to instruction-templates/StableVicuna.yaml diff --git a/characters/instruction-following/Starchat-Beta.yaml b/instruction-templates/Starchat-Beta.yaml similarity index 100% rename from characters/instruction-following/Starchat-Beta.yaml rename to instruction-templates/Starchat-Beta.yaml diff --git a/characters/instruction-following/Tulu.yaml b/instruction-templates/Tulu.yaml similarity index 100% rename from characters/instruction-following/Tulu.yaml rename to instruction-templates/Tulu.yaml diff --git a/characters/instruction-following/Vicuna-v0.yaml b/instruction-templates/Vicuna-v0.yaml similarity index 100% rename from characters/instruction-following/Vicuna-v0.yaml rename to instruction-templates/Vicuna-v0.yaml diff --git a/characters/instruction-following/Vicuna-v1.1.yaml b/instruction-templates/Vicuna-v1.1.yaml similarity index 100% rename from characters/instruction-following/Vicuna-v1.1.yaml rename to instruction-templates/Vicuna-v1.1.yaml diff --git a/characters/instruction-following/Vigogne-Chat.yaml b/instruction-templates/Vigogne-Chat.yaml similarity index 100% rename from characters/instruction-following/Vigogne-Chat.yaml rename to instruction-templates/Vigogne-Chat.yaml diff --git a/characters/instruction-following/Vigogne-Instruct.yaml b/instruction-templates/Vigogne-Instruct.yaml similarity index 100% rename from characters/instruction-following/Vigogne-Instruct.yaml rename to instruction-templates/Vigogne-Instruct.yaml diff --git a/characters/instruction-following/Wizard-Mega ShareGPT.yaml b/instruction-templates/Wizard-Mega ShareGPT.yaml similarity index 100% rename from characters/instruction-following/Wizard-Mega ShareGPT.yaml rename to instruction-templates/Wizard-Mega ShareGPT.yaml diff --git a/characters/instruction-following/Wizard-Mega WizardLM.yaml b/instruction-templates/Wizard-Mega WizardLM.yaml similarity index 100% rename from characters/instruction-following/Wizard-Mega WizardLM.yaml rename to instruction-templates/Wizard-Mega WizardLM.yaml diff --git 
a/characters/instruction-following/Wizard-Mega.yaml b/instruction-templates/Wizard-Mega.yaml similarity index 100% rename from characters/instruction-following/Wizard-Mega.yaml rename to instruction-templates/Wizard-Mega.yaml diff --git a/characters/instruction-following/Ziya.yaml b/instruction-templates/Ziya.yaml similarity index 100% rename from characters/instruction-following/Ziya.yaml rename to instruction-templates/Ziya.yaml diff --git a/js/main.js b/js/main.js new file mode 100644 index 00000000..e409cc3d --- /dev/null +++ b/js/main.js @@ -0,0 +1,93 @@ +let main_parent = document.getElementById('chat-tab').parentNode; +let extensions = document.getElementById('extensions'); + +main_parent.childNodes[0].classList.add("header_bar"); +main_parent.style = "padding: 0; margin: 0"; +main_parent.parentNode.parentNode.style = "padding: 0"; + +document.querySelector('.header_bar').addEventListener('click', function(event) { + if (event.target.tagName === 'BUTTON') { + const buttonText = event.target.textContent.trim(); + + let chat_visible = (buttonText == 'Chat'); + let default_visible = (buttonText == 'Default'); + let notebook_visible = (buttonText == 'Notebook'); + + // Check if one of the generation tabs is visible + if (chat_visible || notebook_visible || default_visible) { + extensions.style.display = 'flex'; + if (chat_visible) { + extensions.style.maxWidth = "800px"; + extensions.style.padding = "0px"; + } else { + extensions.style.maxWidth = "none"; + extensions.style.padding = "15px"; + } + } else { + extensions.style.display = 'none'; + } + } +}); + +//------------------------------------------------ +// Add some scrollbars +//------------------------------------------------ +const textareaElements = document.querySelectorAll('.add_scrollbar textarea'); +for(i = 0; i < textareaElements.length; i++) { + textareaElements[i].classList.remove('scroll-hide'); + textareaElements[i].classList.add('pretty_scrollbar'); + textareaElements[i].style.resize = "none"; +} + +//------------------------------------------------ +// Stop generation on Esc pressed +//------------------------------------------------ +document.addEventListener("keydown", function(event) { + if (event.key === "Escape") { + // Find the element with id 'stop' and click it + var stopButton = document.getElementById("stop"); + if (stopButton) { + stopButton.click(); + } + } +}); + +//------------------------------------------------ +// Chat scrolling +//------------------------------------------------ +const targetElement = document.getElementById('chat').parentNode.parentNode.parentNode; + +// Create a MutationObserver instance +const observer = new MutationObserver(function(mutations) { + mutations.forEach(function(mutation) { + let childElement = targetElement.childNodes[2].childNodes[0].childNodes[1]; + childElement.scrollTop = childElement.scrollHeight; + }); +}); + +// Configure the observer to watch for changes in the subtree and attributes +const config = { + childList: true, + subtree: true, + characterData: true, + attributeOldValue: true, + characterDataOldValue: true +}; + +// Start observing the target element +observer.observe(targetElement, config); + +//------------------------------------------------ +// Improve the looks of the chat input field +//------------------------------------------------ +document.getElementById('chat-input').parentNode.style.background = 'transparent'; +document.getElementById('chat-input').parentNode.style.border = 'none'; + +//------------------------------------------------ 
+// Remove some backgrounds +//------------------------------------------------ +const noBackgroundelements = document.querySelectorAll('.no-background'); +for(i = 0; i < noBackgroundelements.length; i++) { + noBackgroundelements[i].parentNode.style.border = 'none'; + noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = 'center'; +} diff --git a/js/save_files.js b/js/save_files.js new file mode 100644 index 00000000..d5b22c4b --- /dev/null +++ b/js/save_files.js @@ -0,0 +1,40 @@ +// Functions for downloading JSON files +function getCurrentTimestamp() { + const now = new Date(); + const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds + const localTime = new Date(now.getTime() - timezoneOffset); + const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, '').slice(0, 15); + return formattedTimestamp; +} + +function saveFile(contents, filename) { + const element = document.createElement('a'); + element.setAttribute('href', 'data:text/plain;charset=utf-8,' + encodeURIComponent(contents)); + element.setAttribute('download', filename); + element.style.display = 'none'; + document.body.appendChild(element); + element.click(); + document.body.removeChild(element); +} + +function saveHistory(history, character, mode) { + let path = null; + + if (['chat', 'chat-instruct'].includes(mode) && character && character.trim() !== '') { + path = `history_${character}_${getCurrentTimestamp()}.json`; + } else { + try { + path = `history_${mode}_${getCurrentTimestamp()}.json`; + } catch (error) { + path = `history_${getCurrentTimestamp()}.json`; + } + } + saveFile(history, path); +} + +function saveSession(session) { + let path = null; + + path = `session_${getCurrentTimestamp()}.json`; + saveFile(session, path); +} diff --git a/js/show_controls.js b/js/show_controls.js new file mode 100644 index 00000000..83bb6c02 --- /dev/null +++ b/js/show_controls.js @@ -0,0 +1,18 @@ +const belowChatInput = document.querySelectorAll("#chat-tab > div > :nth-child(n+3), #extensions"); +const chatParent = document.getElementById("chat").parentNode; + +function toggle_controls(value) { + if (value) { + belowChatInput.forEach(element => { + element.style.display = "inherit"; + }); + + chatParent.classList.remove("bigchat"); + } else { + belowChatInput.forEach(element => { + element.style.display = "none"; + }); + + chatParent.classList.add("bigchat"); + } +} diff --git a/js/switch_tabs.js b/js/switch_tabs.js new file mode 100644 index 00000000..56279193 --- /dev/null +++ b/js/switch_tabs.js @@ -0,0 +1,43 @@ +let chat_tab = document.getElementById('chat-tab'); +let main_parent = chat_tab.parentNode; + +function scrollToTop() { + window.scrollTo({ + top: 0, + // behavior: 'smooth' + }); +} + +function switch_to_chat() { + let chat_tab_button = main_parent.childNodes[0].childNodes[1]; + chat_tab_button.click(); + scrollToTop(); +} + +function switch_to_default() { + let default_tab_button = main_parent.childNodes[0].childNodes[4]; + default_tab_button.click(); + scrollToTop(); +} + +function switch_to_notebook() { + let notebook_tab_button = main_parent.childNodes[0].childNodes[7]; + notebook_tab_button.click(); + scrollToTop(); +} + +function switch_to_generation_parameters() { + let parameters_tab_button = main_parent.childNodes[0].childNodes[10]; + let generation_tab_button = document.getElementById('character-menu').parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.childNodes[0].childNodes[1]; + parameters_tab_button.click(); + 
generation_tab_button.click(); + scrollToTop(); +} + +function switch_to_character() { + let parameters_tab_button = main_parent.childNodes[0].childNodes[10]; + let character_tab_button = document.getElementById('character-menu').parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.childNodes[0].childNodes[4]; + parameters_tab_button.click(); + character_tab_button.click(); + scrollToTop(); +} diff --git a/models/config.yaml b/models/config.yaml index 0c1027c0..61e128cd 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -5,11 +5,23 @@ .*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1): model_type: 'gptj' .*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm): - model_type: 'gpt_neox' + model_type: 'gptneox' .*llama: model_type: 'llama' .*bloom: model_type: 'bloom' +.*gpt2: + model_type: 'gpt2' +.*falcon: + model_type: 'falcon' +.*mpt: + model_type: 'mpt' +.*(starcoder|starchat): + model_type: 'starcoder' +.*dolly-v2: + model_type: 'dollyv2' +.*replit: + model_type: 'replit' llama-65b-gptq-3bit: groupsize: 'None' .*(4bit|int4): @@ -53,9 +65,11 @@ llama-65b-gptq-3bit: .*vicuna.*(1.1|1_1|1.3|1_3): mode: 'instruct' instruction_template: 'Vicuna-v1.1' -.*wizard.*vicuna: +.*vicuna.*(1.5|1_5): mode: 'instruct' instruction_template: 'Vicuna-v1.1' + truncation_length: 4096 + rms_norm_eps: 5.0e-6 .*stable.*vicuna: mode: 'instruct' instruction_template: 'StableVicuna' @@ -108,10 +122,6 @@ llama-65b-gptq-3bit: truncation_length: 4096 .*stablelm-base: truncation_length: 4096 -.*wizardlm: - mode: 'instruct' - model_type: 'llama' - instruction_template: 'WizardLM' .*galactica.*finetuned: mode: 'instruct' instruction_template: 'Galactica Finetuned' @@ -189,21 +199,12 @@ llama-65b-gptq-3bit: .*airoboros.*1.2: mode: 'instruct' instruction_template: 'Airoboros-v1.2' -.*WizardLM-30B-V1.0: - mode: 'instruct' - instruction_template: 'Vicuna-v1.1' -TheBloke_WizardLM-30B-GPTQ: - mode: 'instruct' - instruction_template: 'Vicuna-v1.1' .*alpa(cino|sta): mode: 'instruct' instruction_template: 'Alpaca' .*hippogriff: mode: 'instruct' instruction_template: 'Hippogriff' -.*gpt4all-.*-snoozy: - mode: 'instruct' - instruction_template: 'WizardLM' .*lazarus: mode: 'instruct' instruction_template: 'Alpaca' @@ -234,6 +235,7 @@ TheBloke_WizardLM-30B-GPTQ: .*starchat-beta: mode: 'instruct' instruction_template: 'Starchat-Beta' + custom_stopping_strings: '"<|end|>"' .*minotaur: mode: 'instruct' instruction_template: 'Minotaur' @@ -266,7 +268,7 @@ TheBloke_WizardLM-30B-GPTQ: mode: 'instruct' instruction_template: 'Alpaca' truncation_length: 8192 -.*wizardlm-.*-v1.1: +.*wizardlm: mode: 'instruct' instruction_template: 'Vicuna-v1.1' .*godzilla: @@ -278,8 +280,28 @@ TheBloke_WizardLM-30B-GPTQ: .*llama-(2|v2).*chat: mode: 'instruct' instruction_template: 'Llama-v2' -.*llama.*70b.*ggml.*\.bin: +.*70b.*ggml.*\.bin: n_gqa: 8 .*newhope: mode: 'instruct' instruction_template: 'NewHope' +.*stablebeluga2: + mode: 'instruct' + instruction_template: 'StableBeluga2' + truncation_length: 4096 + rms_norm_eps: 5.0e-6 +.*openchat: + mode: 'instruct' + instruction_template: 'OpenChat' +.*falcon.*-instruct: + mode: 'instruct' +.*(openorca-platypus2): + mode: 'instruct' + instruction_template: 'OpenOrca-Platypus2' + custom_stopping_strings: '"### Instruction:", "### Response:"' + rms_norm_eps: 5.0e-6 +.*codellama: + rope_freq_base: 1000000 +.*codellama.*instruct: + mode: 'instruct' + instruction_template: 'Llama-v2' diff 
--git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py index 0d41ac0a..987f5ba7 100644 --- a/modules/AutoGPTQ_loader.py +++ b/modules/AutoGPTQ_loader.py @@ -50,6 +50,7 @@ def load_quantized(model_name): 'max_memory': get_max_memory_dict(), 'quantize_config': quantize_config, 'use_cuda_fp16': not shared.args.no_use_cuda_fp16, + 'disable_exllama': shared.args.disable_exllama, } logger.info(f"The AutoGPTQ params are: {params}") diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index ddc5f9a5..bc528b18 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -1,6 +1,5 @@ import inspect import re -import sys from pathlib import Path import accelerate @@ -11,26 +10,9 @@ from transformers import AutoConfig, AutoModelForCausalLM import modules.shared as shared from modules.logging_colors import logger -sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) - -try: - import llama_inference_offload -except ImportError: - logger.error('Failed to load GPTQ-for-LLaMa') - logger.error('See https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md') - sys.exit(-1) - -try: - from modelutils import find_layers -except ImportError: - from utils import find_layers - -try: - from quant import make_quant - is_triton = False -except ImportError: - import quant - is_triton = True +from gptq_for_llama import llama_inference_offload +from gptq_for_llama.modelutils import find_layers +from gptq_for_llama.quant import make_quant # This function is a replacement for the load_quant function in the @@ -59,24 +41,21 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc if name in layers: del layers[name] - if not is_triton: - gptq_args = inspect.getfullargspec(make_quant).args + gptq_args = inspect.getfullargspec(make_quant).args - make_quant_kwargs = { - 'module': model, - 'names': layers, - 'bits': wbits, - } - if 'groupsize' in gptq_args: - make_quant_kwargs['groupsize'] = groupsize - if 'faster' in gptq_args: - make_quant_kwargs['faster'] = faster_kernel - if 'kernel_switch_threshold' in gptq_args: - make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold + make_quant_kwargs = { + 'module': model, + 'names': layers, + 'bits': wbits, + } + if 'groupsize' in gptq_args: + make_quant_kwargs['groupsize'] = groupsize + if 'faster' in gptq_args: + make_quant_kwargs['faster'] = faster_kernel + if 'kernel_switch_threshold' in gptq_args: + make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold - make_quant(**make_quant_kwargs) - else: - quant.make_quant_linear(model, layers, wbits, groupsize) + make_quant(**make_quant_kwargs) del layers if checkpoint.endswith('.safetensors'): @@ -85,18 +64,6 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc else: model.load_state_dict(torch.load(checkpoint), strict=False) - if is_triton: - if shared.args.quant_attn: - quant.make_quant_attn(model) - - if eval and shared.args.fused_mlp: - quant.make_fused_mlp(model) - - if shared.args.warmup_autotune: - quant.autotune_warmup_linear(model, transpose=not eval) - if eval and shared.args.fused_mlp: - quant.autotune_warmup_fused(model) - model.seqlen = 2048 return model diff --git a/modules/LoRA.py b/modules/LoRA.py index 1350783f..10020552 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -17,6 +17,14 @@ def add_lora_to_model(lora_names): add_lora_transformers(lora_names) +def get_lora_path(lora_name): + p = Path(lora_name) + if p.exists(): + lora_name = p.parts[-1] + + return 
Path(f"{shared.args.lora_dir}/{lora_name}") + + def add_lora_exllama(lora_names): try: @@ -40,7 +48,7 @@ def add_lora_exllama(lora_names): if len(lora_names) > 1: logger.warning('ExLlama can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.') - lora_path = Path(f"{shared.args.lora_dir}/{lora_names[0]}") + lora_path = get_lora_path(lora_names[0]) lora_config_path = lora_path / "adapter_config.json" lora_adapter_path = lora_path / "adapter_model.bin" @@ -81,7 +89,7 @@ def add_lora_autogptq(lora_names): inference_mode=True, ) - lora_path = Path(f"{shared.args.lora_dir}/{lora_names[0]}") + lora_path = get_lora_path(lora_names[0]) logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]]))) shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path) shared.lora_names = [lora_names[0]] @@ -101,7 +109,7 @@ def add_lora_transformers(lora_names): if len(removed_set) == 0 and len(prior_set) > 0: logger.info(f"Adding the LoRA(s) named {added_set} to the model...") for lora in added_set: - shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora) + shared.model.load_adapter(get_lora_path(lora), lora) return @@ -123,9 +131,9 @@ def add_lora_transformers(lora_names): params['device_map'] = {"base_model.model." + k: v for k, v in shared.model.hf_device_map.items()} logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names))) - shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_names[0]}"), adapter_name=lora_names[0], **params) + shared.model = PeftModel.from_pretrained(shared.model, get_lora_path(lora_names[0]), adapter_name=lora_names[0], **params) for lora in lora_names[1:]: - shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora) + shared.model.load_adapter(get_lora_path(lora), lora) shared.lora_names = lora_names diff --git a/modules/RoPE.py b/modules/RoPE.py new file mode 100644 index 00000000..c15616c6 --- /dev/null +++ b/modules/RoPE.py @@ -0,0 +1,18 @@ +def get_alpha_value(alpha, base): + ''' + Gets alpha_value from alpha_value and rope_freq_base + ''' + if base > 0: + return (base/10000.) ** (63/64.) + else: + return alpha + + +def get_rope_freq_base(alpha, base): + ''' + Gets rope_freq_base from alpha_value and rope_freq_base + ''' + if base > 0: + return base + else: + return 10000 * alpha ** (64/63.) 
diff --git a/modules/callbacks.py b/modules/callbacks.py index 1fa95e47..e29e397d 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -24,6 +24,7 @@ class Stream(transformers.StoppingCriteria): def __call__(self, input_ids, scores) -> bool: if self.callback_func is not None: self.callback_func(input_ids[0]) + return False diff --git a/modules/chat.py b/modules/chat.py index 070f45a4..e9c2fe7c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1,9 +1,9 @@ import base64 import copy import functools +import html import json import re -from datetime import datetime from pathlib import Path import gradio as gr @@ -176,9 +176,6 @@ def get_stopping_strings(state): f"\n{state['name2']}:" ] - if state['stop_at_newline']: - stopping_strings.append("\n") - return stopping_strings @@ -192,17 +189,18 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output return - # Defining some variables just_started = True visible_text = None stopping_strings = get_stopping_strings(state) is_stream = state['stream'] - # Preparing the input + # Prepare the input if not any((regenerate, _continue)): - visible_text = text + visible_text = html.escape(text) + + # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) - text = apply_extensions('input', text, state) + text = apply_extensions('input', text, state, is_chat=True) # *Is typing...* if loading_message: @@ -212,6 +210,7 @@ if regenerate: output['visible'].pop() output['internal'].pop() + # *Is typing...* if loading_message: yield {'visible': output['visible'] + [[visible_text, shared.processing_message]], 'internal': output['internal']} @@ -220,86 +219,64 @@ if loading_message: yield {'visible': output['visible'][:-1] + [[visible_text, last_reply[1] + '...']], 'internal': output['internal']} - # Generating the prompt + # Generate the prompt kwargs = { '_continue': _continue, 'history': output, } - prompt = apply_extensions('custom_generate_chat_prompt', text, state, **kwargs) if prompt is None: prompt = generate_chat_prompt(text, state, **kwargs) # Generate - cumulative_reply = '' - for i in range(state['chat_generation_attempts']): - reply = None - for j, reply in enumerate(generate_reply(prompt + cumulative_reply, state, stopping_strings=stopping_strings, is_chat=True)): - reply = cumulative_reply + reply + reply = None + for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True)): - # Extract the reply - visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply) + # Extract the reply + visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply) + visible_reply = html.escape(visible_reply) - # We need this global variable to handle the Stop event, - # otherwise gradio gets confused - if shared.stop_everything: - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state) + if shared.stop_everything: + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) + yield output + return + + if just_started: + just_started = False + if not _continue: + output['internal'].append(['', '']) + output['visible'].append(['', '']) + + if _continue: + output['internal'][-1] = [text, last_reply[0] + reply] + output['visible'][-1] = [visible_text, last_reply[1] + visible_reply] + if is_stream: + yield 
output + elif not (j == 0 and visible_reply.strip() == ''): + output['internal'][-1] = [text, reply.lstrip(' ')] + output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')] + if is_stream: yield output - return - if just_started: - just_started = False - if not _continue: - output['internal'].append(['', '']) - output['visible'].append(['', '']) - - if _continue: - output['internal'][-1] = [text, last_reply[0] + reply] - output['visible'][-1] = [visible_text, last_reply[1] + visible_reply] - if is_stream: - yield output - elif not (j == 0 and visible_reply.strip() == ''): - output['internal'][-1] = [text, reply.lstrip(' ')] - output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')] - if is_stream: - yield output - - if reply in [None, cumulative_reply]: - break - else: - cumulative_reply = reply - - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state) + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output -def impersonate_wrapper(text, start_with, state): +def impersonate_wrapper(text, state): if shared.model_name == 'None' or shared.model is None: logger.error("No model is loaded! Select one in the Model tab.") yield '' return - # Defining some variables - cumulative_reply = '' prompt = generate_chat_prompt('', state, impersonate=True) stopping_strings = get_stopping_strings(state) yield text + '...' - cumulative_reply = text - for i in range(state['chat_generation_attempts']): - reply = None - for reply in generate_reply(prompt + cumulative_reply, state, stopping_strings=stopping_strings, is_chat=True): - reply = cumulative_reply + reply - yield reply.lstrip(' ') - if shared.stop_everything: - return - - if reply in [None, cumulative_reply]: - break - else: - cumulative_reply = reply - - yield cumulative_reply.lstrip(' ') + reply = None + for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True): + yield (text + reply).lstrip(' ') + if shared.stop_everything: + return def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_message=True): @@ -315,15 +292,15 @@ def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_ # Same as above but returns HTML for the UI -def generate_chat_reply_wrapper(text, start_with, state, regenerate=False, _continue=False): - if start_with != '' and not _continue: +def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): + if state['start_with'] != '' and not _continue: if regenerate: text, state['history'] = remove_last_message(state['history']) regenerate = False _continue = True send_dummy_message(text, state) - send_dummy_reply(start_with, state) + send_dummy_reply(state['start_with'], state) for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style']), history @@ -336,29 +313,32 @@ def remove_last_message(history): else: last = ['', ''] - return last[0], history + return html.unescape(last[0]), history def send_last_reply_to_input(history): - if len(history['internal']) > 0: - return history['internal'][-1][1] + if len(history['visible']) > 0: + return html.unescape(history['visible'][-1][1]) else: return '' def replace_last_reply(text, state): history = state['history'] - if len(history['visible']) > 0: - history['visible'][-1][1] = text - history['internal'][-1][1] = 
apply_extensions('input', text, state) + + if len(text.strip()) == 0: + return history + elif len(history['visible']) > 0: + history['visible'][-1][1] = html.escape(text) + history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) return history def send_dummy_message(text, state): history = state['history'] - history['visible'].append([text, '']) - history['internal'].append([apply_extensions('input', text, state), '']) + history['visible'].append([html.escape(text), '']) + history['internal'].append([apply_extensions('input', text, state, is_chat=True), '']) return history @@ -368,8 +348,8 @@ def send_dummy_reply(text, state): history['visible'].append(['', '']) history['internal'].append(['', '']) - history['visible'][-1][1] = text - history['internal'][-1][1] = apply_extensions('input', text, state) + history['visible'][-1][1] = html.escape(text) + history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) return history @@ -383,7 +363,7 @@ def clear_chat_log(state): if mode != 'instruct': if greeting != '': history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] - history['visible'] += [['', apply_extensions('output', greeting, state)]] + history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]] return history @@ -394,6 +374,9 @@ def redraw_html(history, name1, name2, mode, style, reset_cache=False): def save_history(history, path=None): p = path or Path('logs/exported_history.json') + if not p.parent.is_dir(): + p.parent.mkdir(parents=True) + with open(p, 'w', encoding='utf-8') as f: f.write(json.dumps(history, indent=4)) @@ -412,36 +395,30 @@ def load_history(file, history): return history -def save_history_at_user_request(history, character, mode): - def make_timestamp_path(character=None): - return f"logs/{character or ''}{'_' if character else ''}{datetime.now().strftime('%Y%m%d-%H%M%S')}.json" - - path = None - if mode in ['chat', 'chat-instruct'] and character not in ['', 'None', None]: - path = make_timestamp_path(character) - else: - # Try to use mode as the file name, otherwise just use the timestamp - try: - path = make_timestamp_path(mode.capitalize()) - except: - path = make_timestamp_path() - - return save_history(history, path) - - def save_persistent_history(history, character, mode): if mode in ['chat', 'chat-instruct'] and character not in ['', 'None', None] and not shared.args.multi_user: - save_history(history, path=Path(f'logs/{character}_persistent.json')) + save_history(history, path=Path(f'logs/persistent_{character}.json')) def load_persistent_history(state): + if shared.session_is_loading: + shared.session_is_loading = False + return state['history'] + if state['mode'] == 'instruct': return state['history'] character = state['character_menu'] greeting = replace_character_names(state['greeting'], state['name1'], state['name2']) - p = Path(f'logs/{character}_persistent.json') - if not shared.args.multi_user and character not in ['None', '', None] and p.exists(): + + should_load_history = (not shared.args.multi_user and character not in ['None', '', None]) + old_p = Path(f'logs/{character}_persistent.json') + p = Path(f'logs/persistent_{character}.json') + if should_load_history and old_p.exists(): + logger.warning(f"Renaming {old_p} to {p}") + old_p.rename(p) + + if should_load_history and p.exists(): f = json.loads(open(p, 'rb').read()) if 'internal' in f and 'visible' in f: history = f @@ -453,7 +430,7 @@ def load_persistent_history(state): history = {'internal': [], 'visible': 
[]} if greeting != "": history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] - history['visible'] += [['', apply_extensions('output', greeting, state)]] + history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]] return history @@ -483,11 +460,11 @@ def load_character(character, name1, name2, instruct=False): picture = None # Deleting the profile picture cache, if any - if Path("cache/pfp_character.png").exists(): + if Path("cache/pfp_character.png").exists() and not instruct: Path("cache/pfp_character.png").unlink() if character not in ['None', '', None]: - folder = 'characters' if not instruct else 'characters/instruction-following' + folder = 'characters' if not instruct else 'instruction-templates' picture = generate_pfp_cache(character) filepath = None for extension in ["yml", "yaml", "json"]: @@ -522,9 +499,6 @@ def load_character(character, name1, name2, instruct=False): context = build_pygmalion_style_context(data) greeting_field = 'char_greeting' - if 'example_dialogue' in data: - context += f"{data['example_dialogue'].strip()}\n" - if greeting_field in data: greeting = data[greeting_field] @@ -535,7 +509,6 @@ def load_character(character, name1, name2, instruct=False): context = shared.settings['context'] name2 = shared.settings['name2'] greeting = shared.settings['greeting'] - turn_template = shared.settings['turn_template'] return name1, name2, picture, greeting, context, turn_template.replace("\n", r"\n") @@ -585,6 +558,9 @@ def build_pygmalion_style_context(data): if 'world_scenario' in data and data['world_scenario'] != '': context += f"Scenario: {data['world_scenario']}\n" + if 'example_dialogue' in data and data['example_dialogue'] != '': + context += f"{data['example_dialogue'].strip()}\n" + context = f"{context.strip()}\n" return context diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py new file mode 100644 index 00000000..70ce92f5 --- /dev/null +++ b/modules/ctransformers_model.py @@ -0,0 +1,79 @@ +from ctransformers import AutoConfig, AutoModelForCausalLM + +from modules import shared +from modules.callbacks import Iteratorize +from modules.logging_colors import logger + + +class CtransformersModel: + def __init__(self): + pass + + @classmethod + def from_pretrained(cls, path): + result = cls() + + config = AutoConfig.from_pretrained( + str(path), + threads=shared.args.threads if shared.args.threads != 0 else -1, + gpu_layers=shared.args.n_gpu_layers, + batch_size=shared.args.n_batch, + context_length=shared.args.n_ctx, + stream=True, + mmap=not shared.args.no_mmap, + mlock=shared.args.mlock + ) + + result.model = AutoModelForCausalLM.from_pretrained( + str(result.model_dir(path) if result.model_type_is_auto() else path), + model_type=(None if result.model_type_is_auto() else shared.args.model_type), + config=config + ) + + logger.info(f'Using ctransformers model_type: {result.model.model_type} for {result.model.model_path}') + return result, result + + def model_type_is_auto(self): + return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None" + + def model_dir(self, path): + if path.is_file(): + return path.parent + + return path + + def encode(self, string, **kwargs): + return self.model.tokenize(string) + + def decode(self, ids): + return self.model.detokenize(ids) + + def generate(self, prompt, state, callback=None): + prompt = prompt if type(prompt) is str else prompt.decode() + # ctransformers uses -1 for random seed + generator = self.model( + 
prompt=prompt, + max_new_tokens=state['max_new_tokens'], + temperature=state['temperature'], + top_p=state['top_p'], + top_k=state['top_k'], + repetition_penalty=state['repetition_penalty'], + last_n_tokens=state['repetition_penalty_range'], + seed=int(state['seed']) + ) + + output = "" + for token in generator: + if callback: + callback(token) + + output += token + + return output + + def generate_with_streaming(self, *args, **kwargs): + with Iteratorize(self.generate, args, kwargs, callback=None) as generator: + reply = '' + for token in generator: + reply += token + yield reply diff --git a/modules/exllama.py b/modules/exllama.py index ecfb10a4..7df1d321 100644 --- a/modules/exllama.py +++ b/modules/exllama.py @@ -1,9 +1,11 @@ from pathlib import Path +import torch.nn.functional as F from torch import version as torch_version -from modules import shared +from modules import RoPE, shared from modules.logging_colors import logger +from modules.models import clear_torch_cache from modules.text_generation import get_max_prompt_length try: @@ -54,8 +56,8 @@ class ExllamaModel: config.set_auto_map(shared.args.gpu_split) config.gpu_peer_fix = True - if shared.args.alpha_value: - config.alpha_value = shared.args.alpha_value + if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0: + config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base) config.calculate_rotary_embedding_base() if torch_version.hip: @@ -78,6 +80,21 @@ class ExllamaModel: return result, result def generate_with_streaming(self, prompt, state): + + # The cache batch size must be 2 for CFG and 1 otherwise + if state['guidance_scale'] == 1: + if self.cache.batch_size == 2: + del self.cache + clear_torch_cache() + self.cache = ExLlamaCache(self.model) + self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) + else: + if self.cache.batch_size == 1: + del self.cache + clear_torch_cache() + self.cache = ExLlamaCache(self.model, batch_size=2) + self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) + self.generator.settings.temperature = state['temperature'] self.generator.settings.top_p = state['top_p'] self.generator.settings.top_k = state['top_k'] @@ -89,27 +106,72 @@ class ExllamaModel: else: self.generator.disallow_tokens(None) - self.generator.end_beam_search() + # Case 1: no CFG + if state['guidance_scale'] == 1: + self.generator.end_beam_search() - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - ids = ids[:, -get_max_prompt_length(state):] + # Tokenizing the input + ids = self.generator.tokenizer.encode(prompt, max_seq_len=self.model.config.max_seq_len) + ids = ids[:, -get_max_prompt_length(state):] + if state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - ids.shape[-1] + else: + max_new_tokens = state['max_new_tokens'] - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - for i in range(state['max_new_tokens']): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True + self.generator.gen_begin_reuse(ids) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text + for i in range(max_new_tokens): + token = self.generator.gen_single_token() + if i == 0 and 
self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True - yield decoded_text - if token.item() == self.generator.tokenizer.eos_token_id or shared.stop_everything: - break + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + yield decoded_text + if token.item() == self.generator.tokenizer.eos_token_id or shared.stop_everything: + break + + # Case 2: CFG + # Copied from https://github.com/turboderp/exllama/blob/master/example_cfg.py + else: + alpha = state['guidance_scale'] + prompts = [prompt, state['negative_prompt'] or ''] + + ids, mask = self.tokenizer.encode(prompts, return_mask=True, max_seq_len=self.model.config.max_seq_len) + if state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - ids[0].shape[-1] + else: + max_new_tokens = state['max_new_tokens'] + + self.generator.gen_begin(ids, mask=mask) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False + + for i in range(max_new_tokens): + logits = self.model.forward(self.generator.sequence[:, -1:], self.cache, input_mask=mask) + self.generator.apply_rep_penalty(logits) + + logits = F.log_softmax(logits, dim=-1) + logits_mixed = alpha * logits[0] + (1 - alpha) * logits[1] + + token, _ = self.generator.sample_current(logits_mixed) + if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True + + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + yield decoded_text + if token.item() == self.tokenizer.eos_token_id or shared.stop_everything: + break + + batch_token = token.repeat(2, 1) + self.generator.gen_accept_token(batch_token) def generate(self, prompt, state): output = '' @@ -119,7 +181,7 @@ class ExllamaModel: return output def encode(self, string, **kwargs): - return self.tokenizer.encode(string) + return self.tokenizer.encode(string, max_seq_len=self.model.config.max_seq_len) def decode(self, string, **kwargs): return self.tokenizer.decode(string)[0] diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py index fd775b4a..eab92644 100644 --- a/modules/exllama_hf.py +++ b/modules/exllama_hf.py @@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithPast -from modules import shared +from modules import RoPE, shared from modules.logging_colors import logger try: @@ -29,10 +29,16 @@ class ExllamaHF(PreTrainedModel): super().__init__(PretrainedConfig()) self.ex_config = config self.ex_model = ExLlama(self.ex_config) - self.ex_cache = ExLlamaCache(self.ex_model) self.generation_config = GenerationConfig() self.lora = None + self.ex_cache = ExLlamaCache(self.ex_model) + self.past_seq = None + + if shared.args.cfg_cache: + self.ex_cache_negative = ExLlamaCache(self.ex_model) + self.past_seq_negative = None + def _validate_model_class(self): pass @@ -47,26 +53,46 @@ class ExllamaHF(PreTrainedModel): return torch.device(0) def __call__(self, *args, **kwargs): - # TODO: Some decoding methods (such as Contrastive Search) may not work at this time - assert len(args) == 0, 'no *args should be passed to forward' use_cache = kwargs.get('use_cache', True) labels = kwargs.get('labels', None) - seq = kwargs['input_ids'][0].tolist() - cache = kwargs['past_key_values'] if 
'past_key_values' in kwargs else None + past_key_values = kwargs.get('past_key_values', None) - if labels is None: - if cache is None: - self.ex_cache.current_seq_len = 0 - cache = self.ex_cache - self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), cache, preprocess_only=True, lora=self.lora) + if len(args) > 0: + if not shared.args.cfg_cache: + logger.error("Please enable the cfg-cache option to use CFG with ExLlama_HF.") + return - logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), cache, lora=self.lora).to(kwargs['input_ids'].device) + input_ids = args[0] + is_negative = True + past_seq = self.past_seq_negative + ex_cache = self.ex_cache_negative else: - if cache is None: - self.ex_cache.current_seq_len = 0 - cache = self.ex_cache + input_ids = kwargs['input_ids'] + is_negative = False + past_seq = self.past_seq + ex_cache = self.ex_cache - logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), cache, last_id_only=False, lora=self.lora) + seq = input_ids[0].tolist() + if is_negative and past_key_values is not None: + seq = past_key_values + seq + + seq_tensor = torch.tensor(seq) + + # Make the forward call + if labels is None: + if past_seq is None or not torch.equal(past_seq, seq_tensor[:-1]): + ex_cache.current_seq_len = 0 + self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), ex_cache, preprocess_only=True, lora=self.lora) + + logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), ex_cache, lora=self.lora).to(input_ids.device) + else: + ex_cache.current_seq_len = 0 + logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache, last_id_only=False, lora=self.lora) + + if is_negative: + self.past_seq_negative = seq_tensor + else: + self.past_seq = seq_tensor loss = None if labels is not None: @@ -81,7 +107,7 @@ class ExllamaHF(PreTrainedModel): shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) - return CausalLMOutputWithPast(logits=logits, past_key_values=cache if use_cache else None, loss=loss) + return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss) @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): @@ -108,8 +134,8 @@ class ExllamaHF(PreTrainedModel): config.set_auto_map(shared.args.gpu_split) config.gpu_peer_fix = True - if shared.args.alpha_value: - config.alpha_value = shared.args.alpha_value + if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0: + config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base) config.calculate_rotary_embedding_base() if torch.version.hip: diff --git a/modules/extensions.py b/modules/extensions.py index 76b6be8b..796ff072 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -53,14 +53,32 @@ def iterator(): # Extension functions that map string -> string -def _apply_string_extensions(function_name, text, state): +def _apply_string_extensions(function_name, text, state, is_chat=False): for extension, _ in iterator(): if hasattr(extension, function_name): func = getattr(extension, function_name) - if len(signature(func).parameters) == 2: - text = func(text, state) + + # Handle old extensions without the 'state' arg or + # the 'is_chat' kwarg + count = 0 + has_chat = False + for k in signature(func).parameters: + if k == 'is_chat': + has_chat = True + else: + count += 1 + + if count == 2: + args = [text, state] else: - text = 
func(text) + args = [text] + + if has_chat: + kwargs = {'is_chat': is_chat} + else: + kwargs = {} + + text = func(*args, **kwargs) return text @@ -169,9 +187,7 @@ def create_extensions_block(): if len(to_display) > 0: with gr.Column(elem_id="extensions"): for row in to_display: - extension, name = row - display_name = getattr(extension, 'params', {}).get('display_name', name) - gr.Markdown(f"\n### {display_name}") + extension, _ = row extension.ui() diff --git a/modules/html_generator.py b/modules/html_generator.py index ab0aeab0..47ca6095 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -1,3 +1,4 @@ +import html import os import re import time @@ -6,6 +7,7 @@ from pathlib import Path import markdown from PIL import Image, ImageOps +from modules.logging_colors import logger from modules.utils import get_available_chat_styles # This is to store the paths to the thumbnails of the profile pictures @@ -61,8 +63,30 @@ def convert_to_markdown(string): if is_code: result = result + '```' # Unfinished code block - string = result.strip() - return markdown.markdown(string, extensions=['fenced_code', 'tables']) + result = result.strip() + + # Unfinished list, like "\n1.". A |delete| string is added and then + # removed to force a
<ol> to be generated instead of a <p>
    . + if re.search(r'(\d+\.?)$', result): + delete_str = '|delete|' + + if not result.endswith('.'): + result += '.' + + result = re.sub(r'(\d+\.)$', r'\g<1> ' + delete_str, result) + + html_output = markdown.markdown(result, extensions=['fenced_code', 'tables']) + pos = html_output.rfind(delete_str) + if pos > -1: + html_output = html_output[:pos] + html_output[pos + len(delete_str):] + else: + html_output = markdown.markdown(result, extensions=['fenced_code', 'tables']) + + # Unescape code blocks + pattern = re.compile(r']*>(.*?)', re.DOTALL) + html_output = pattern.sub(lambda x: html.unescape(x.group()), html_output) + + return html_output def generate_basic_html(string): @@ -81,7 +105,7 @@ def process_post(post, c): src = re.sub('>', '>', src) src = re.sub('(>>[0-9]*)', '\\1', src) src = re.sub('\n', '
    \n', src) - src = f'

    {src}\n' + src = f'
    {src}\n' src = f'Anonymous No.{number}\n{src}' return src @@ -102,6 +126,7 @@ def generate_4chan_html(f): post = line else: post += line + if post != '': src = process_post(post, c) posts.append(src) @@ -116,13 +141,14 @@ def generate_4chan_html(f): output += f'
    ' for post in posts: output += post + output += '
    ' output = output.split('\n') for i in range(len(output)): output[i] = re.sub(r'^(>(.*?)(
    |))', r'\1', output[i]) - output[i] = re.sub(r'^
    (>(.*?)(
    |))', r'
    \1', output[i]) - output = '\n'.join(output) + output[i] = re.sub(r'^
    (>(.*?)(
    |))', r'
    \1', output[i]) + output = '\n'.join(output) return output @@ -142,7 +168,13 @@ def get_image_cache(path): mtime = os.stat(path).st_mtime if (path in image_cache and mtime != image_cache[path][0]) or (path not in image_cache): img = make_thumbnail(Image.open(path)) - output_file = Path(f'cache/{path.name}_cache.png') + + old_p = Path(f'cache/{path.name}_cache.png') + p = Path(f'cache/cache_{path.name}.png') + if old_p.exists(): + old_p.rename(p) + + output_file = p img.convert('RGB').save(output_file, format='PNG') image_cache[path] = [mtime, output_file.as_posix()] @@ -150,10 +182,21 @@ def get_image_cache(path): def generate_instruct_html(history): - output = f'
    ' - for i, _row in enumerate(history[::-1]): + output = f'
    ' + for i, _row in enumerate(history): row = [convert_to_markdown(entry) for entry in _row] + if row[0]: # don't display empty user messages + output += f""" +
    +
    +
    + {row[0]} +
    +
    +
    + """ + output += f"""
    @@ -164,34 +207,38 @@ def generate_instruct_html(history):
    """ - if len(row[0]) == 0: # don't display empty user messages - continue - - output += f""" -
    -
    -
    - {row[0]} -
    -
    -
    - """ - - output += "
    " + output += "
    " return output def generate_cai_chat_html(history, name1, name2, style, reset_cache=False): - output = f'
    ' + output = f'
    ' # We use ?name2 and ?time.time() to force the browser to reset caches img_bot = f'' if Path("cache/pfp_character.png").exists() else '' img_me = f'' if Path("cache/pfp_me.png").exists() else '' - for i, _row in enumerate(history[::-1]): + for i, _row in enumerate(history): row = [convert_to_markdown(entry) for entry in _row] + if row[0]: # don't display empty user messages + output += f""" +
    +
    + {img_me} +
    +
    +
    + {name1} +
    +
    + {row[0]} +
    +
    +
    + """ + output += f"""
    @@ -208,49 +255,18 @@ def generate_cai_chat_html(history, name1, name2, style, reset_cache=False):
    """ - if len(row[0]) == 0: # don't display empty user messages - continue - - output += f""" -
    -
    - {img_me} -
    -
    -
    - {name1} -
    -
    - {row[0]} -
    -
    -
    - """ - - output += "
    " + output += "
    " return output def generate_chat_html(history, name1, name2, reset_cache=False): - output = f'
    ' + output = f'
    ' - for i, _row in enumerate(history[::-1]): + for i, _row in enumerate(history): row = [convert_to_markdown(entry) for entry in _row] - output += f""" -
    -
    -
    - {row[1]} -
    -
    -
    - """ - - if len(row[0]) == 0: # don't display empty user messages - continue - - output += f""" + if row[0]: # don't display empty user messages + output += f"""
    @@ -260,7 +276,17 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
    """ - output += "
    " + output += f""" +
    +
    +
    + {row[1]} +
    +
    +
    + """ + + output += "
    " return output diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 349a5782..918ce7f8 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -7,24 +7,65 @@ from torch.nn import CrossEntropyLoss from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithPast -from modules import shared +from modules import RoPE, shared from modules.logging_colors import logger +from modules.utils import is_gguf + +import llama_cpp + +try: + import llama_cpp_ggml +except: + llama_cpp_ggml = llama_cpp if torch.cuda.is_available() and not torch.version.hip: try: - from llama_cpp_cuda import Llama + import llama_cpp_cuda except: - from llama_cpp import Llama + llama_cpp_cuda = None + try: + import llama_cpp_ggml_cuda + except: + llama_cpp_ggml_cuda = llama_cpp_cuda else: - from llama_cpp import Llama + llama_cpp_cuda = None + llama_cpp_ggml_cuda = None + + +def llama_cpp_lib(model_file: Union[str, Path] = None): + if model_file is not None: + gguf_model = is_gguf(model_file) + else: + gguf_model = True + + if shared.args.cpu or llama_cpp_cuda is None: + return llama_cpp if gguf_model else llama_cpp_ggml + else: + return llama_cpp_cuda if gguf_model else llama_cpp_ggml_cuda class LlamacppHF(PreTrainedModel): - def __init__(self, model): + def __init__(self, model, path): super().__init__(PretrainedConfig()) self.model = model self.generation_config = GenerationConfig() - self.cache = None + + self.past_seq = None + self.llamacpp_cache = { + 'n_tokens': self.model.n_tokens, + 'input_ids': self.model.input_ids, + 'scores': self.model.scores, + 'ctx': self.model.ctx + } + + if shared.args.cfg_cache: + self.past_seq_negative = None + self.llamacpp_cache_negative = { + 'n_tokens': self.model.n_tokens, + 'input_ids': self.model.input_ids.copy(), + 'scores': self.model.scores.copy(), + 'ctx': llama_cpp_lib(path).llama_new_context_with_model(model.model, model.params) + } def _validate_model_class(self): pass @@ -35,37 +76,86 @@ class LlamacppHF(PreTrainedModel): def prepare_inputs_for_generation(self, input_ids, **kwargs): return {'input_ids': input_ids, **kwargs} + def save_cache(self): + self.llamacpp_cache.update({ + 'n_tokens': self.model.n_tokens, + 'input_ids': self.model.input_ids, + 'scores': self.model.scores, + 'ctx': self.model.ctx + }) + + def save_negative_cache(self): + self.llamacpp_cache_negative.update({ + 'n_tokens': self.model.n_tokens, + 'input_ids': self.model.input_ids, + 'scores': self.model.scores, + 'ctx': self.model.ctx + }) + + def load_cache(self): + self.model.n_tokens = self.llamacpp_cache['n_tokens'] + self.model.input_ids = self.llamacpp_cache['input_ids'] + self.model.scores = self.llamacpp_cache['scores'] + self.model.ctx = self.llamacpp_cache['ctx'] + + def load_negative_cache(self): + self.model.n_tokens = self.llamacpp_cache_negative['n_tokens'] + self.model.input_ids = self.llamacpp_cache_negative['input_ids'] + self.model.scores = self.llamacpp_cache_negative['scores'] + self.model.ctx = self.llamacpp_cache_negative['ctx'] + @property def device(self) -> torch.device: return torch.device(0) def __call__(self, *args, **kwargs): - # TODO: Some decoding methods (such as Contrastive Search) may not work at this time - assert len(args) == 0, 'no *args should be passed to forward' use_cache = kwargs.get('use_cache', True) labels = kwargs.get('labels', None) - seq = kwargs['input_ids'][0].tolist() - cache = kwargs['past_key_values'] if 'past_key_values' in kwargs else None + 
past_key_values = kwargs.get('past_key_values', None) + + if len(args) > 0: + if not shared.args.cfg_cache: + logger.error("Please enable the cfg-cache option to use CFG with llamacpp_HF.") + return + + input_ids = args[0] + is_negative = True + past_seq = self.past_seq_negative + self.load_negative_cache() + else: + input_ids = kwargs['input_ids'] + is_negative = False + past_seq = self.past_seq + self.load_cache() + + seq = input_ids[0].tolist() + if is_negative and past_key_values is not None: + seq = past_key_values + seq + + seq_tensor = torch.tensor(seq) # Make the forward call - seq_tensor = torch.tensor(seq) if labels is None: - if self.cache is None or not torch.equal(self.cache, seq_tensor[:-1]): + if past_seq is None or not torch.equal(past_seq, seq_tensor[:-1]): self.model.reset() self.model.eval(seq) else: self.model.eval([seq[-1]]) - logits = torch.tensor(self.model.scores[self.model.n_tokens-1, :]).view(1, 1, -1).to(kwargs['input_ids'].device) + logits = torch.tensor(self.model.scores[self.model.n_tokens - 1, :]).view(1, 1, -1).to(input_ids.device) else: self.model.reset() self.model.eval(seq) logits = torch.tensor(self.model.eval_logits) - logits = logits.view(1, logits.shape[0], logits.shape[1]).to(kwargs['input_ids'].device) + logits = logits.view(1, logits.shape[0], logits.shape[1]).to(input_ids.device) - self.cache = seq_tensor + if is_negative: + self.save_negative_cache() + self.past_seq_negative = seq_tensor + else: + self.save_cache() + self.past_seq = seq_tensor - # Based on transformers/models/llama/modeling_llama.py loss = None if labels is not None: # Shift so that tokens < n predict n @@ -79,7 +169,7 @@ class LlamacppHF(PreTrainedModel): shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) - return CausalLMOutputWithPast(logits=logits, past_key_values=cache if use_cache else None, loss=loss) + return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss) @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): @@ -91,9 +181,15 @@ class LlamacppHF(PreTrainedModel): if path.is_file(): model_file = path else: - model_file = list(path.glob('*ggml*.bin'))[0] + model_file = (list(path.glob('*.gguf*')) + list(path.glob('*ggml*.bin')))[0] logger.info(f"llama.cpp weights detected: {model_file}\n") + + if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '': + tensor_split_list = None + else: + tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")] + params = { 'model_path': str(model_file), 'n_ctx': shared.args.n_ctx, @@ -102,14 +198,23 @@ class LlamacppHF(PreTrainedModel): 'n_batch': shared.args.n_batch, 'use_mmap': not shared.args.no_mmap, 'use_mlock': shared.args.mlock, + 'mul_mat_q': shared.args.mul_mat_q, 'low_vram': shared.args.low_vram, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), + 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), + 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, - 'n_gqa': shared.args.n_gqa or None, - 'rms_norm_eps': shared.args.rms_norm_eps or None, 'logits_all': True, } + if not is_gguf(model_file): + ggml_params = { + 'n_gqa': shared.args.n_gqa or None, + 'rms_norm_eps': shared.args.rms_norm_eps or None, + } + params = params | ggml_params + + Llama = llama_cpp_lib(model_file).Llama model = 
Llama(**params) - return LlamacppHF(model) + + return LlamacppHF(model, model_file) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 0f9c3470..12aa3a4f 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -1,19 +1,47 @@ import re from functools import partial +from pathlib import Path +from typing import Union import torch -from modules import shared +from modules import RoPE, shared from modules.callbacks import Iteratorize from modules.logging_colors import logger +from modules.text_generation import get_max_prompt_length +from modules.utils import is_gguf + +import llama_cpp + +try: + import llama_cpp_ggml +except: + llama_cpp_ggml = llama_cpp if torch.cuda.is_available() and not torch.version.hip: try: - from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList + import llama_cpp_cuda except: - from llama_cpp import Llama, LlamaCache, LogitsProcessorList + llama_cpp_cuda = None + try: + import llama_cpp_ggml_cuda + except: + llama_cpp_ggml_cuda = llama_cpp_cuda else: - from llama_cpp import Llama, LlamaCache, LogitsProcessorList + llama_cpp_cuda = None + llama_cpp_ggml_cuda = None + + +def llama_cpp_lib(model_file: Union[str, Path] = None): + if model_file is not None: + gguf_model = is_gguf(model_file) + else: + gguf_model = True + + if shared.args.cpu or llama_cpp_cuda is None: + return llama_cpp if gguf_model else llama_cpp_ggml + else: + return llama_cpp_cuda if gguf_model else llama_cpp_ggml_cuda def ban_eos_logits_processor(eos_token, input_ids, logits): @@ -30,6 +58,10 @@ class LlamaCppModel: @classmethod def from_pretrained(self, path): + + Llama = llama_cpp_lib(path).Llama + LlamaCache = llama_cpp_lib(path).LlamaCache + result = self() cache_capacity = 0 if shared.args.cache_capacity is not None: @@ -41,6 +73,12 @@ class LlamaCppModel: cache_capacity = int(shared.args.cache_capacity) logger.info("Cache capacity is " + str(cache_capacity) + " bytes") + + if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '': + tensor_split_list = None + else: + tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")] + params = { 'model_path': str(path), 'n_ctx': shared.args.n_ctx, @@ -49,14 +87,21 @@ class LlamaCppModel: 'n_batch': shared.args.n_batch, 'use_mmap': not shared.args.no_mmap, 'use_mlock': shared.args.mlock, + 'mul_mat_q': shared.args.mul_mat_q, 'low_vram': shared.args.low_vram, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), + 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), + 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, - 'n_gqa': shared.args.n_gqa or None, - 'rms_norm_eps': shared.args.rms_norm_eps or None, } + if not is_gguf(path): + ggml_params = { + 'n_gqa': shared.args.n_gqa or None, + 'rms_norm_eps': shared.args.rms_norm_eps or None, + } + params = params | ggml_params + result.model = Llama(**params) if cache_capacity > 0: result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity)) @@ -74,7 +119,16 @@ class LlamaCppModel: return self.model.detokenize(tokens) def generate(self, prompt, state, callback=None): + + LogitsProcessorList = llama_cpp_lib().LogitsProcessorList + prompt = prompt if type(prompt) is str else prompt.decode() + + # Handle truncation + prompt = self.encode(prompt) + prompt = prompt[-get_max_prompt_length(state):] + prompt = self.decode(prompt).decode('utf-8') + completion_chunks = 
self.model.create_completion( prompt=prompt, max_tokens=state['max_new_tokens'], @@ -94,6 +148,8 @@ class LlamaCppModel: output = "" for completion_chunk in completion_chunks: + if shared.stop_everything: + break text = completion_chunk['choices'][0]['text'] output += text if callback: diff --git a/modules/loaders.py b/modules/loaders.py index 6d0291bf..45a4e933 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -1,10 +1,47 @@ import functools +from collections import OrderedDict import gradio as gr from modules import shared -loaders_and_params = { +loaders_and_params = OrderedDict({ + 'Transformers': [ + 'cpu_memory', + 'gpu_memory', + 'trust_remote_code', + 'load_in_8bit', + 'bf16', + 'cpu', + 'disk', + 'auto_devices', + 'load_in_4bit', + 'use_double_quant', + 'quant_type', + 'compute_dtype', + 'trust_remote_code', + 'alpha_value', + 'rope_freq_base', + 'compress_pos_emb', + 'transformers_info' + ], + 'ExLlama_HF': [ + 'gpu_split', + 'max_seq_len', + 'alpha_value', + 'rope_freq_base', + 'compress_pos_emb', + 'cfg_cache', + 'exllama_HF_info', + ], + 'ExLlama': [ + 'gpu_split', + 'max_seq_len', + 'alpha_value', + 'rope_freq_base', + 'compress_pos_emb', + 'exllama_info', + ], 'AutoGPTQ': [ 'triton', 'no_inject_fused_attention', @@ -13,6 +50,7 @@ loaders_and_params = { 'wbits', 'groupsize', 'desc_act', + 'disable_exllama', 'gpu_memory', 'cpu_memory', 'cpu', @@ -33,61 +71,48 @@ loaders_and_params = { 'n_gqa', 'rms_norm_eps', 'n_gpu_layers', + 'tensor_split', 'n_batch', 'threads', 'no_mmap', 'low_vram', 'mlock', + 'mul_mat_q', 'llama_cpp_seed', - 'compress_pos_emb', 'alpha_value', + 'rope_freq_base', + 'compress_pos_emb', + 'cpu', ], 'llamacpp_HF': [ 'n_ctx', 'n_gqa', 'rms_norm_eps', 'n_gpu_layers', + 'tensor_split', 'n_batch', 'threads', 'no_mmap', 'low_vram', 'mlock', - 'llama_cpp_seed', - 'compress_pos_emb', + 'mul_mat_q', 'alpha_value', + 'rope_freq_base', + 'compress_pos_emb', + 'cpu', + 'cfg_cache', 'llamacpp_HF_info', ], - 'Transformers': [ - 'cpu_memory', - 'gpu_memory', - 'trust_remote_code', - 'load_in_8bit', - 'bf16', - 'cpu', - 'disk', - 'auto_devices', - 'load_in_4bit', - 'use_double_quant', - 'quant_type', - 'compute_dtype', - 'trust_remote_code', - 'transformers_info' - ], - 'ExLlama': [ - 'gpu_split', - 'max_seq_len', - 'compress_pos_emb', - 'alpha_value', - 'exllama_info', - ], - 'ExLlama_HF': [ - 'gpu_split', - 'max_seq_len', - 'compress_pos_emb', - 'alpha_value', - 'exllama_HF_info', + 'ctransformers': [ + 'n_ctx', + 'n_gpu_layers', + 'n_batch', + 'threads', + 'model_type', + 'no_mmap', + 'mlock' ] -} +}) loaders_samplers = { 'Transformers': { @@ -113,9 +138,12 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 'ExLlama_HF': { 'temperature', @@ -136,9 +164,12 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 'ExLlama': { 'temperature', @@ -148,7 +179,10 @@ loaders_samplers = { 'repetition_penalty', 'repetition_penalty_range', 'seed', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', + 'auto_max_new_tokens', }, 'AutoGPTQ': { 'temperature', @@ -173,9 +207,12 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 
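The `RoPE.get_alpha_value()` and `RoPE.get_rope_freq_base()` helpers referenced by these loaders come from a `modules/RoPE.py` that does not appear in this section. Going by the relation stated in the `--rope_freq_base` help text further down (rope_freq_base = 10000 * alpha_value ^ (64 / 63)), they presumably reduce to something like this sketch:

```python
def get_alpha_value(alpha, base):
    # Convert an explicit rope_freq_base back into an equivalent alpha value,
    # or pass alpha through unchanged when no base is given.
    if base > 0:
        return (base / 10000.) ** (63 / 64.)
    return alpha


def get_rope_freq_base(alpha, base):
    # Prefer an explicit base; otherwise derive it from alpha using the
    # documented relation rope_freq_base = 10000 * alpha ** (64 / 63).
    if base > 0:
        return base
    return 10000 * alpha ** (64 / 63.)
```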
'GPTQ-for-LLaMa': { 'temperature', @@ -200,9 +237,12 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 'llama.cpp': { 'temperature', @@ -234,10 +274,42 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, + 'ctransformers': { + 'temperature', + 'top_p', + 'top_k', + 'repetition_penalty', + 'repetition_penalty_range', + } +} + +loaders_model_types = { + 'GPTQ-for-LLaMa': [ + "None", + "llama", + "opt", + "gptj" + ], + 'ctransformers': [ + "None", + "gpt2", + "gptj", + "gptneox", + "llama", + "mpt", + "dollyv2", + "replit", + "starcoder", + "gptbigcode", + "falcon" + ], } @@ -259,6 +331,13 @@ def blacklist_samplers(loader): return [gr.update(visible=True) if sampler in loaders_samplers[loader] else gr.update(visible=False) for sampler in all_samplers] +def get_model_types(loader): + if loader in loaders_model_types: + return loaders_model_types[loader] + + return ["None"] + + def get_gpu_memory_keys(): return [k for k in shared.gradio if k.startswith('gpu_memory')] diff --git a/modules/logits.py b/modules/logits.py new file mode 100644 index 00000000..3aed6624 --- /dev/null +++ b/modules/logits.py @@ -0,0 +1,31 @@ +import torch + +from modules import sampler_hijack, shared +from modules.text_generation import generate_reply + +global_scores = None + + +def get_next_logits(prompt, state, use_samplers, previous): + if use_samplers: + state['max_new_tokens'] = 1 + state['auto_max_new_tokens'] = False + for _ in generate_reply(prompt, state): + pass + + scores = sampler_hijack.global_scores[-1] + else: + tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda() + output = shared.model(input_ids=tokens) + scores = output['logits'][-1][-1] + + probs = torch.softmax(scores, dim=-1, dtype=torch.float) + topk_values, topk_indices = torch.topk(probs, k=25, largest=True, sorted=True) + topk_values = [f"{float(i):.5f}" for i in topk_values] + tokens = [shared.tokenizer.decode(i) for i in topk_indices] + + output = '' + for row in list(zip(topk_values, tokens)): + output += f"{row[0]} - {row[1]}\n" + + return output, previous diff --git a/modules/models.py b/modules/models.py index 4866893a..3025fe3d 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,9 +1,9 @@ import gc +import hashlib import os import re import time from pathlib import Path -import hashlib import torch import transformers @@ -14,11 +14,11 @@ from transformers import ( AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, - BitsAndBytesConfig, + BitsAndBytesConfig ) import modules.shared as shared -from modules import llama_attn_hijack, sampler_hijack +from modules import llama_attn_hijack, RoPE, sampler_hijack from modules.logging_colors import logger from modules.models_settings import infer_loader @@ -58,7 +58,8 @@ def load_model(model_name, loader=None): 'llamacpp_HF': llamacpp_HF_loader, 'RWKV': RWKV_loader, 'ExLlama': ExLlama_loader, - 'ExLlama_HF': ExLlama_HF_loader + 'ExLlama_HF': ExLlama_HF_loader, + 'ctransformers': ctransformers_loader, } p = Path(model_name) @@ -144,7 +145,7 @@ def huggingface_loader(model_name): LoaderClass = AutoModelForCausalLM # Load the model in simple 16-bit mode by default - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, 
shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]): model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code) if torch.backends.mps.is_available(): device = torch.device('mps') @@ -215,6 +216,11 @@ def huggingface_loader(model_name): no_split_module_classes=model._no_split_modules ) + if shared.args.compress_pos_emb > 1: + params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} + elif shared.args.alpha_value > 1: + params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)} + model = LoaderClass.from_pretrained(checkpoint, **params) return model @@ -235,9 +241,9 @@ def llamacpp_loader(model_name): if path.is_file(): model_file = path else: - model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))[0] + model_file = (list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf*')) + list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin')))[0] - logger.info(f"llama.cpp weights detected: {model_file}\n") + logger.info(f"llama.cpp weights detected: {model_file}") model, tokenizer = LlamaCppModel.from_pretrained(model_file) return model, tokenizer @@ -263,6 +269,33 @@ def llamacpp_HF_loader(model_name): return model, tokenizer +def ctransformers_loader(model_name): + from modules.ctransformers_model import CtransformersModel + + path = Path(f'{shared.args.model_dir}/{model_name}') + ctrans = CtransformersModel() + if ctrans.model_type_is_auto(): + model_file = path + else: + if path.is_file(): + model_file = path + else: + entries = Path(f'{shared.args.model_dir}/{model_name}') + gguf = list(entries.glob('*.gguf')) + bin = list(entries.glob('*.bin')) + if len(gguf) > 0: + model_file = gguf[0] + elif len(bin) > 0: + model_file = bin[0] + else: + logger.error("Could not find a model for ctransformers.") + return None, None + + logger.info(f'ctransformers weights detected: {model_file}') + model, tokenizer = ctrans.from_pretrained(model_file) + return model, tokenizer + + def GPTQ_loader(model_name): # Monkey patch diff --git a/modules/models_settings.py b/modules/models_settings.py index 00a6b90f..c55b1e88 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -24,11 +24,11 @@ def infer_loader(model_name): loader = None elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0): loader = 'AutoGPTQ' - elif len(list(path_to_model.glob('*ggml*.bin'))) > 0: + elif len(list(path_to_model.glob('*.gguf*')) + list(path_to_model.glob('*ggml*.bin'))) > 0: loader = 'llama.cpp' - elif re.match('.*ggml.*\.bin', model_name.lower()): + elif re.match(r'.*\.gguf|.*ggml.*\.bin', model_name.lower()): loader = 'llama.cpp' - elif re.match('.*rwkv.*\.pth', model_name.lower()): + elif re.match(r'.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV' else: loader = 'Transformers' @@ -91,8 +91,8 @@ def 
apply_model_settings_to_state(model, state): if 'wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0: loader = 'AutoGPTQ' - # If the user is using an alternative GPTQ loader, let them keep using it - if not (loader == 'AutoGPTQ' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF']): + # If the user is using an alternative loader for the same model type, let them keep using it + if not (loader == 'AutoGPTQ' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF']) and not (loader == 'llama.cpp' and state['loader'] in ['llamacpp_HF', 'ctransformers']): state['loader'] = loader for k in model_settings: diff --git a/modules/presets.py b/modules/presets.py index 072b15fd..32b7f71c 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -9,6 +9,7 @@ def default_preset(): 'do_sample': True, 'temperature': 1, 'top_p': 1, + 'top_k': 0, 'typical_p': 1, 'epsilon_cutoff': 0, 'eta_cutoff': 0, @@ -17,19 +18,23 @@ def default_preset(): 'repetition_penalty': 1, 'repetition_penalty_range': 0, 'encoder_repetition_penalty': 1, - 'top_k': 0, - 'num_beams': 1, - 'penalty_alpha': 0, - 'min_length': 0, - 'length_penalty': 1, 'no_repeat_ngram_size': 0, - 'early_stopping': False, + 'min_length': 0, + 'guidance_scale': 1, 'mirostat_mode': 0, 'mirostat_tau': 5.0, 'mirostat_eta': 0.1, + 'penalty_alpha': 0, + 'num_beams': 1, + 'length_penalty': 1, + 'early_stopping': False, } +def presets_params(): + return [k for k in default_preset()] + + def load_preset(name): generate_params = default_preset() if name not in ['None', None, '']: @@ -51,12 +56,12 @@ def load_preset_memoized(name): def load_preset_for_ui(name, state): generate_params = load_preset(name) state.update(generate_params) - return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']] + return state, *[generate_params[k] for k in presets_params()] def generate_preset_yaml(state): defaults = default_preset() - data = {k: state[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']} + data = {k: state[k] for k in presets_params()} # Remove entries that are identical to the defaults for k in list(data.keys()): diff --git a/modules/prompts.py b/modules/prompts.py new file mode 100644 index 00000000..1d1a66b5 --- /dev/null +++ b/modules/prompts.py @@ -0,0 +1,51 @@ +from pathlib import Path + +import yaml + +from modules import utils +from modules.text_generation import get_encoded_length + + +def load_prompt(fname): + if fname in ['None', '']: + return '' + else: + file_path = Path(f'prompts/{fname}.txt') + if not file_path.exists(): + return '' + + with open(file_path, 'r', encoding='utf-8') as f: + text = f.read() + if text[-1] == '\n': + text = text[:-1] + + return text + + +def load_instruction_prompt_simple(fname): + file_path = Path(f'instruction-templates/{fname}.yaml') + if not file_path.exists(): + return '' + + with open(file_path, 'r', encoding='utf-8') as f: + data = 
yaml.safe_load(f) + output = '' + if 'context' in data: + output += data['context'] + + replacements = { + '<|user|>': data['user'], + '<|bot|>': data['bot'], + '<|user-message|>': 'Input', + } + + output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements) + return output.rstrip(' ') + + +def count_tokens(text): + try: + tokens = get_encoded_length(text) + return str(tokens) + except: + return '-1' diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 0a86b4fd..0a724f47 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -10,6 +10,8 @@ from transformers.generation.logits_process import ( TemperatureLogitsWarper ) +global_scores = None + class TailFreeLogitsWarper(LogitsWarper): def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): @@ -104,7 +106,7 @@ class MirostatLogitsWarper(LogitsWarper): break # Normalize the probabilities of the remaining words - prob_topk = torch.softmax(sorted_logits, dim=0) + prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda') prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda') @@ -122,6 +124,16 @@ class MirostatLogitsWarper(LogitsWarper): return scores +class SpyLogitsWarper(LogitsWarper): + def __init__(self): + pass + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + global global_scores + global_scores = scores + return scores + + class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor): ''' Copied from the transformers library @@ -168,6 +180,7 @@ def get_logits_warper_patch(self, generation_config): else: warpers += warpers_to_add + warpers.append(SpyLogitsWarper()) return warpers diff --git a/modules/shared.py b/modules/shared.py index 59d49ab6..ca68c1a6 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -6,63 +6,59 @@ import yaml from modules.logging_colors import logger -generation_lock = None + +# Model variables model = None tokenizer = None -is_seq2seq = False model_name = "None" -lora_names = [] +is_seq2seq = False model_dirty_from_training = False +lora_names = [] -# Chat variables +# Generation variables stop_everything = False +generation_lock = None processing_message = '*Is typing...*' -# UI elements (buttons, sliders, HTML, etc) +# UI variables gradio = {} - -# For keeping the values of UI elements on page reload persistent_interface_state = {} - -input_params = [] # Generation input parameters -reload_inputs = [] # Parameters for reloading the chat interface - -# For restarting the interface need_restart = False +session_is_loading = False +# UI defaults settings = { 'dark_theme': True, - 'autoload_model': False, + 'show_controls': True, + 'start_with': '', + 'mode': 'chat', + 'chat_style': 'TheEncrypted777', + 'character': 'None', + 'prompt-default': 'QA', + 'prompt-notebook': 'QA', + 'preset': 'simple-1', 'max_new_tokens': 200, 'max_new_tokens_min': 1, 'max_new_tokens_max': 4096, 'seed': -1, - 'character': 'None', + 'negative_prompt': '', + 'truncation_length': 2048, + 'truncation_length_min': 0, + 'truncation_length_max': 16384, + 'custom_stopping_strings': '', + 'auto_max_new_tokens': False, + 'ban_eos_token': False, + 'add_bos_token': True, + 'skip_special_tokens': True, + 'stream': True, 'name1': 'You', 'name2': 'Assistant', 'context': 'This is a conversation with your Assistant. 
It is a computer program designed to help you with various tasks such as answering questions, providing recommendations, and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information.', 'greeting': '', - 'turn_template': '', - 'custom_stopping_strings': '', - 'stop_at_newline': False, - 'add_bos_token': True, - 'ban_eos_token': False, - 'skip_special_tokens': True, - 'truncation_length': 2048, - 'truncation_length_min': 0, - 'truncation_length_max': 16384, - 'mode': 'chat', - 'start_with': '', - 'chat_style': 'TheEncrypted777', 'instruction_template': 'None', 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', - 'chat_generation_attempts': 1, - 'chat_generation_attempts_min': 1, - 'chat_generation_attempts_max': 10, - 'default_extensions': [], - 'chat_default_extensions': ['gallery'], - 'preset': 'simple-1', - 'prompt': 'QA', + 'autoload_model': False, + 'default_extensions': ['gallery'], } @@ -80,8 +76,8 @@ def str2bool(v): parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54)) # Basic settings -parser.add_argument('--notebook', action='store_true', help='Launch the web UI in notebook mode, where the output is written to the same text box as the input.') -parser.add_argument('--chat', action='store_true', help='Launch the web UI in chat mode with a style similar to the Character.AI website.') +parser.add_argument('--notebook', action='store_true', help='DEPRECATED') +parser.add_argument('--chat', action='store_true', help='DEPRECATED') parser.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. WARNING: this is highly experimental.') parser.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.') parser.add_argument('--model', type=str, help='Name of the model to load by default.') @@ -89,7 +85,7 @@ parser.add_argument('--lora', type=str, nargs="+", help='The list of LoRAs to lo parser.add_argument("--model-dir", type=str, default='models/', help="Path to directory with all the models") parser.add_argument("--lora-dir", type=str, default='loras/', help="Path to directory with all the loras") parser.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.') -parser.add_argument('--no-stream', action='store_true', help='Don\'t stream the text output in real time.') +parser.add_argument('--no-stream', action='store_true', help='DEPRECATED') parser.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. 
If you want to load more than one extension, write the names separated by spaces.') parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') @@ -123,12 +119,14 @@ parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') parser.add_argument('--low-vram', action='store_true', help='Low VRAM Mode') parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') +parser.add_argument('--mul_mat_q', action='store_true', help='Activate new mulmat kernels.') parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') +parser.add_argument('--tensor_split', type=str, default=None, help="Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17") parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.') parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)') -parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama2 70b.') -parser.add_argument('--rms_norm_eps', type=float, default=0, help='Must be 1e-5 for llama2 70b.') +parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama-2 70b.') +parser.add_argument('--rms_norm_eps', type=float, default=0, help='5e-6 is a good value for llama-2 models.') # GPTQ parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') @@ -137,22 +135,19 @@ parser.add_argument('--groupsize', type=int, default=-1, help='Group size.') parser.add_argument('--pre_layer', type=int, nargs="+", help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.') parser.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. 
If not specified, it will be automatically detected.') parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.') -parser.add_argument('--quant_attn', action='store_true', help='(triton) Enable quant attention.') -parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.') -parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.') # AutoGPTQ -parser.add_argument('--gptq-for-llama', action='store_true', help='DEPRECATED') -parser.add_argument('--autogptq', action='store_true', help='DEPRECATED') parser.add_argument('--triton', action='store_true', help='Use triton.') parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).') parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).') parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') +parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') # ExLlama parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7") parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.") +parser.add_argument('--cfg-cache', action='store_true', help="ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.") # DeepSpeed parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -164,8 +159,9 @@ parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The s parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.') # RoPE -parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") -parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.") +parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.") +parser.add_argument('--rope_freq_base', type=int, default=0, help="If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).") +parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.") # Gradio parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.') @@ -175,12 +171,15 @@ parser.add_argument('--share', action='store_true', help='Create a public URL. 
T parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.') parser.add_argument("--gradio-auth", type=str, help='set gradio authentication like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3"', default=None) parser.add_argument("--gradio-auth-path", type=str, help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"', default=None) +parser.add_argument("--ssl-keyfile", type=str, help='The path to the SSL certificate key file.', default=None) +parser.add_argument("--ssl-certfile", type=str, help='The path to the SSL certificate cert file.', default=None) # API parser.add_argument('--api', action='store_true', help='Enable the API extension.') parser.add_argument('--api-blocking-port', type=int, default=5000, help='The listening port for the blocking API.') parser.add_argument('--api-streaming-port', type=int, default=5005, help='The listening port for the streaming API.') parser.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.') +parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None) # Multimodal parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') @@ -189,12 +188,9 @@ args = parser.parse_args() args_defaults = parser.parse_args([]) # Deprecation warnings -if args.autogptq: - logger.warning('--autogptq has been deprecated and will be removed soon. Use --loader autogptq instead.') - args.loader = 'autogptq' -if args.gptq_for_llama: - logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.') - args.loader = 'gptq-for-llama' +for k in ['chat', 'notebook', 'no_stream']: + if getattr(args, k): + logger.warning(f'The --{k} flag has been deprecated and will be removed soon. 
Please remove that flag.') # Security warnings if args.trust_remote_code: @@ -206,6 +202,9 @@ if args.multi_user: def fix_loader_name(name): + if not name: + return name + name = name.lower() if name in ['llamacpp', 'llama.cpp', 'llama-cpp', 'llama cpp']: return 'llama.cpp' @@ -221,10 +220,8 @@ def fix_loader_name(name): return 'ExLlama' elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']: return 'ExLlama_HF' - - -if args.loader is not None: - args.loader = fix_loader_name(args.loader) + elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']: + return 'ctransformers' def add_extension(name): @@ -234,36 +231,28 @@ def add_extension(name): args.extensions.append(name) -# Activating the API extension +def is_chat(): + return True + + +args.loader = fix_loader_name(args.loader) + +# Activate the API extension if args.api or args.public_api: add_extension('api') -# Activating the multimodal extension +# Activate the multimodal extension if args.multimodal_pipeline is not None: add_extension('multimodal') - -def is_chat(): - return args.chat - - -def get_mode(): - if args.chat: - return 'chat' - elif args.notebook: - return 'notebook' - else: - return 'default' - - -# Loading model-specific settings +# Load model-specific settings with Path(f'{args.model_dir}/config.yaml') as p: if p.exists(): model_config = yaml.safe_load(open(p, 'r').read()) else: model_config = {} -# Applying user-defined model settings +# Load custom model-specific settings with Path(f'{args.model_dir}/config-user.yaml') as p: if p.exists(): user_config = yaml.safe_load(open(p, 'r').read()) diff --git a/modules/text_generation.py b/modules/text_generation.py index e1be6aa3..5128e503 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -1,5 +1,6 @@ import ast import copy +import html import random import re import time @@ -31,15 +32,71 @@ def generate_reply(*args, **kwargs): shared.generation_lock.release() -def get_max_prompt_length(state): - return state['truncation_length'] - state['max_new_tokens'] +def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False): + + # Find the appropriate generation function + generate_func = apply_extensions('custom_generate_reply') + if generate_func is None: + if shared.model_name == 'None' or shared.model is None: + logger.error("No model is loaded! 
Select one in the Model tab.") + yield '' + return + + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel']: + generate_func = generate_reply_custom + else: + generate_func = generate_reply_HF + + # Prepare the input + original_question = question + if not is_chat: + state = apply_extensions('state', state) + question = apply_extensions('input', question, state) + + # Find the stopping strings + all_stop_strings = [] + for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")): + if type(st) is list and len(st) > 0: + all_stop_strings += st + + if shared.args.verbose: + print(f'\n\n{question}\n--------------------\n') + + shared.stop_everything = False + clear_torch_cache() + seed = set_manual_seed(state['seed']) + last_update = -1 + reply = '' + is_stream = state['stream'] + if len(all_stop_strings) > 0 and not state['stream']: + state = copy.deepcopy(state) + state['stream'] = True + + # Generate + for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat): + if escape_html: + reply = html.escape(reply) + + reply, stop_found = apply_stopping_strings(reply, all_stop_strings) + if is_stream: + cur_time = time.time() + if cur_time - last_update > 0.041666666666666664: # Limit streaming to 24 fps + last_update = cur_time + yield reply + + if stop_found: + break + + if not is_chat: + reply = apply_extensions('output', reply, state) + + yield reply def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None): - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel']: input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) - return input_ids else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) @@ -51,7 +108,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) @@ -62,6 +119,10 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt return input_ids.cuda() +def decode(output_ids, skip_special_tokens=True): + return shared.tokenizer.decode(output_ids, skip_special_tokens) + + def get_encoded_length(prompt): length_after_extensions = apply_extensions('tokenized_length', prompt) if length_after_extensions is not None: @@ -70,12 +131,36 @@ def get_encoded_length(prompt): return len(encode(prompt)[0]) -def decode(output_ids, skip_special_tokens=True): - return shared.tokenizer.decode(output_ids, skip_special_tokens) +def get_max_prompt_length(state): + return state['truncation_length'] - state['max_new_tokens'] + + +def generate_reply_wrapper(question, state, stopping_strings=None): + """ + Returns formatted outputs for the UI + """ + reply = question if not shared.is_seq2seq else '' + yield formatted_outputs(reply, shared.model_name) + + for reply in generate_reply(question, state, stopping_strings, is_chat=False, escape_html=True): + if not shared.is_seq2seq: + reply = question 
+ reply + + yield formatted_outputs(reply, shared.model_name) + + +def formatted_outputs(reply, model_name): + if any(s in model_name for s in ['gpt-4chan', 'gpt4chan']): + reply = fix_gpt4chan(reply) + return html.unescape(reply), generate_4chan_html(reply) + else: + return html.unescape(reply), generate_basic_html(reply) -# Removes empty replies from gpt4chan outputs def fix_gpt4chan(s): + """ + Removes empty replies from gpt4chan outputs + """ for i in range(10): s = re.sub("--- [0-9]*\n>>[0-9]*\n---", "---", s) s = re.sub("--- [0-9]*\n *\n---", "---", s) @@ -84,8 +169,10 @@ def fix_gpt4chan(s): return s -# Fix the LaTeX equations in galactica def fix_galactica(s): + """ + Fix the LaTeX equations in GALACTICA + """ s = s.replace(r'\[', r'$') s = s.replace(r'\]', r'$') s = s.replace(r'\(', r'$') @@ -110,14 +197,6 @@ def get_reply_from_output_ids(output_ids, input_ids, original_question, state, i return reply -def formatted_outputs(reply, model_name): - if any(s in model_name for s in ['gpt-4chan', 'gpt4chan']): - reply = fix_gpt4chan(reply) - return reply, generate_4chan_html(reply) - else: - return reply, generate_basic_html(reply) - - def set_manual_seed(seed): seed = int(seed) if seed == -1: @@ -134,17 +213,6 @@ def stop_everything_event(): shared.stop_everything = True -def generate_reply_wrapper(question, state, stopping_strings=None): - reply = question if not shared.is_seq2seq else '' - yield formatted_outputs(reply, shared.model_name) - - for reply in generate_reply(question, state, stopping_strings, is_chat=False): - if not shared.is_seq2seq: - reply = question + reply - - yield formatted_outputs(reply, shared.model_name) - - def apply_stopping_strings(reply, all_stop_strings): stop_found = False for string in all_stop_strings: @@ -170,66 +238,14 @@ def apply_stopping_strings(reply, all_stop_strings): return reply, stop_found -def _generate_reply(question, state, stopping_strings=None, is_chat=False): - generate_func = apply_extensions('custom_generate_reply') - if generate_func is None: - if shared.model_name == 'None' or shared.model is None: - logger.error("No model is loaded! 
Select one in the Model tab.") - yield '' - return - - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']: - generate_func = generate_reply_custom - else: - generate_func = generate_reply_HF - - # Preparing the input - original_question = question - if not is_chat: - state = apply_extensions('state', state) - question = apply_extensions('input', question, state) - - # Finding the stopping strings - all_stop_strings = [] - for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")): - if type(st) is list and len(st) > 0: - all_stop_strings += st - - if shared.args.verbose: - print(f'\n\n{question}\n--------------------\n') - - shared.stop_everything = False - clear_torch_cache() - seed = set_manual_seed(state['seed']) - last_update = -1 - reply = '' - is_stream = state['stream'] - if len(all_stop_strings) > 0 and not state['stream']: - state = copy.deepcopy(state) - state['stream'] = True - - for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat): - reply, stop_found = apply_stopping_strings(reply, all_stop_strings) - if is_stream: - cur_time = time.time() - if cur_time - last_update > 0.041666666666666664: # Limit streaming to 24 fps - last_update = cur_time - yield reply - - if stop_found: - break - - if not is_chat: - reply = apply_extensions('output', reply, state) - - yield reply - - def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): generate_params = {} - for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']: + for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']: generate_params[k] = state[k] + if state['negative_prompt'] != '': + generate_params['negative_prompt_ids'] = encode(state['negative_prompt']) + for k in ['epsilon_cutoff', 'eta_cutoff']: if state[k] > 0: generate_params[k] = state[k] * 1e-4 @@ -247,6 +263,8 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) output = input_ids[0] cuda = not any((shared.args.cpu, shared.args.deepspeed)) + if state['auto_max_new_tokens']: + generate_params['max_new_tokens'] = state['truncation_length'] - input_ids.shape[-1] # Add the encoded tokens to generate_params question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None) @@ -312,6 +330,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False): + """ + For models that do not use the transformers library for sampling + """ seed = set_manual_seed(state['seed']) t0 = time.time() diff --git a/modules/training.py b/modules/training.py index 429bbb1f..3d7cdc91 100644 --- a/modules/training.py +++ b/modules/training.py @@ -17,15 +17,18 @@ from 
pathlib import Path import gradio as gr import torch import transformers -from modules.models import load_model, unload_model - from datasets import Dataset, load_dataset from peft import ( LoraConfig, get_peft_model, - prepare_model_for_int8_training, + prepare_model_for_kbit_training, set_peft_model_state_dict ) +from peft.utils.other import \ + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as model_to_lora_modules +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +) from modules import shared, ui, utils from modules.evaluate import ( @@ -34,131 +37,128 @@ from modules.evaluate import ( save_past_evaluations ) from modules.logging_colors import logger +from modules.models import reload_model from modules.utils import natural_keys -# This mapping is from a very recent commit, not yet released. -# If not available, default to a backup map for some common model types. -try: - from peft.utils.other import \ - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \ - model_to_lora_modules - from transformers.models.auto.modeling_auto import ( - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - ) - MODEL_CLASSES = {v: k for k, v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES} -except: - standard_modules = ["q_proj", "v_proj"] - model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"], "rw": ["query_key_value"]} - MODEL_CLASSES = { - "LlamaForCausalLM": "llama", - "OPTForCausalLM": "opt", - "GPTJForCausalLM": "gptj", - "GPTNeoXForCausalLM": "gpt_neox", - "RWForCausalLM": "rw" - - } +MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()} +PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"] +WANT_INTERRUPT = False train_log = {} train_template = {} -WANT_INTERRUPT = False -PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"] - -def create_train_interface(): - with gr.Tab('Train LoRA', elem_id='lora-train-tab'): - gr.Markdown("Confused? [[Click here for a guide]](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Training-LoRAs.md)") - - with gr.Row(): - lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file') - always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name given is the same as an existing file, checking this will replace that file. 
Leaving unchecked will load that file and continue from it (must use the same rank value as the original had).') - save_steps = gr.Number(label='Save every n steps', value=0, info='If above 0, a checkpoint of the LoRA will be saved every time this many steps pass.') - - with gr.Row(): - copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=utils.get_available_loras()) - ui.create_refresh_button(copy_from, lambda: None, lambda: {'choices': utils.get_available_loras()}, 'refresh-button') - - with gr.Row(): - # TODO: Implement multi-device support. - micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.') - batch_size = gr.Slider(label='Batch Size', value=128, minimum=0, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.') - - with gr.Row(): - epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.') - learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.') - lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.') - - # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale. - lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, higher values like 128 or 256 are good for teaching content upgrades, extremely high values (1024+) are difficult to train but may improve fine-detail learning for large datasets. Higher ranks also require higher VRAM.') - lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.') - - cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. 
Higher values require drastically more VRAM.') - - with gr.Tab(label='Formatted Dataset'): +def create_ui(): + with gr.Tab("Training", elem_id="training-tab"): + with gr.Tab('Train LoRA', elem_id='lora-train-tab'): + tmp = gr.State('') with gr.Row(): - dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.') - ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') - eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.') - ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') - format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.') - ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button') + with gr.Column(): + gr.Markdown("[Tutorial](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Training-LoRAs.md)") - eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') + with gr.Row(): + copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=utils.get_available_loras(), elem_classes=['slim-dropdown']) + ui.create_refresh_button(copy_from, lambda: None, lambda: {'choices': utils.get_available_loras()}, 'refresh-button') - with gr.Tab(label="Raw text file"): + with gr.Row(): + with gr.Column(scale=5): + lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file') + with gr.Column(): + always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background']) + + with gr.Row(): + with gr.Column(): + lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='Also called dimension count. Higher values = larger file, more content control. Smaller values = smaller file, less control. Use 4 or 8 for style, 128 or 256 to teach, 1024+ for fine-detail on big data. More VRAM is needed for higher ranks.') + lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.') + batch_size = gr.Slider(label='Batch Size', value=128, minimum=0, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.') + micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.') + cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. 
Higher values require drastically more VRAM.') + + with gr.Column(): + save_steps = gr.Number(label='Save every n steps', value=0, info='If above 0, a checkpoint of the LoRA will be saved every time this many steps pass.') + + epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.') + learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='In scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.') + lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.', elem_classes=['slim-dropdown']) + + with gr.Accordion(label='Advanced Options', open=False): + with gr.Row(): + with gr.Column(): + lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.') + stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='The process will automatically stop once the desired loss value is reached. (reasonable numbers are 1.5-1.8)') + optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.', elem_classes=['slim-dropdown']) + + with gr.Column(): + warmup_steps = gr.Number(label='Warmup Steps', value=100, info='For this many steps at the start, the learning rate will be lower than normal. This helps the trainer prepare the model and precompute statistics to improve the quality of training after the start.') + train_only_after = gr.Textbox(label='Train Only After', value='', info='Only consider text *after* this string in any given chunk for training. For Alpaca datasets, use "### Response:" to only train the response and ignore the input.') + + add_eos_token = gr.Checkbox(label='Add EOS token', value=False, info="Adds EOS token for each dataset item. In case of raw text, the EOS will be added at the Hard Cut") + + higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. 
This will not work without a datacenter-class GPU.') + report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True) + + with gr.Column(): + with gr.Tab(label='Formatted Dataset'): + with gr.Row(): + format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown']) + ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button') + + with gr.Row(): + dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown']) + ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') + + with gr.Row(): + eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown']) + ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') + + eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') + + with gr.Tab(label="Raw text file"): + with gr.Row(): + raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.', elem_classes=['slim-dropdown']) + ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button') + + with gr.Row(): + with gr.Column(): + overlap_len = gr.Slider(label='Overlap Length', minimum=0, maximum=512, value=128, step=16, info='How many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length). Setting overlap to exactly half the cutoff length may be ideal.') + newline_favor_len = gr.Slider(label='Prefer Newline Cut Length', minimum=0, maximum=512, value=128, step=16, info='Length (in characters, not tokens) of the maximum distance to shift an overlap cut by to ensure chunks cut at newlines. If too low, cuts may occur in the middle of lines.') + + with gr.Column(): + hard_cut_string = gr.Textbox(label='Hard Cut String', value='\\n\\n\\n', info='String that indicates a hard cut between text parts. 
Helps prevent unwanted overlap.') + min_chars = gr.Number(label='Ignore small blocks', value=0, info='Ignore Hard Cut blocks that have less or equal characters than this number') + + with gr.Row(): + start_button = gr.Button("Start LoRA Training", variant='primary') + stop_button = gr.Button("Interrupt") + + output = gr.Markdown(value="Ready") + + with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'): with gr.Row(): - raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.') - ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button') - hard_cut_string = gr.Textbox(label='Hard Cut String', value='\\n\\n\\n', info='String that indicates a hard cut between text parts. Helps prevent unwanted overlap.') - min_chars = gr.Number(label='Ignore small blocks', value=0, info='Ignore Hard Cut blocks that have less or equal characters than this number') + with gr.Column(): + models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True) + evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') + with gr.Row(): + with gr.Column(): + stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') + with gr.Column(): + max_length = gr.Slider(label='max_length', minimum=0, maximum=8096, value=0, step=1, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.') + + with gr.Row(): + start_current_evaluation = gr.Button("Evaluate loaded model") + start_evaluation = gr.Button("Evaluate selected models") + stop_evaluation = gr.Button("Interrupt") + + with gr.Column(): + evaluation_log = gr.Markdown(value='') + + evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True) with gr.Row(): - overlap_len = gr.Slider(label='Overlap Length', minimum=0, maximum=512, value=128, step=16, info='Overlap length - ie how many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length below). Setting overlap to exactly half the cutoff length may be ideal.') - newline_favor_len = gr.Slider(label='Prefer Newline Cut Length', minimum=0, maximum=512, value=128, step=16, info='Length (in characters, not tokens) of the maximum distance to shift an overlap cut by to ensure chunks cut at newlines. If too low, cuts may occur in the middle of lines.') - - with gr.Accordion(label='Advanced Options', open=False): - lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.') - warmup_steps = gr.Number(label='Warmup Steps', value=100, info='For this many steps at the start, the learning rate will be lower than normal. 
This helps the trainer prepare the model and precompute statistics to improve the quality of training after the start.') - optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.') - train_only_after = gr.Textbox(label='Train Only After', value='', info='Only consider text *after* this string in any given chunk for training. For Alpaca datasets, use "### Response:" to only train the response and ignore the input.') - stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='The process will automatically stop once the desired loss value is reached. (reasonable numbers are 1.5-1.8)') - add_eos_token = gr.Checkbox(label='Add EOS token', value=False, info="Adds EOS token for each dataset item. In case of raw text, the EOS will be added at the Hard Cut") - - with gr.Row(): - higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.') - with gr.Row(): - report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True) - - with gr.Row(): - start_button = gr.Button("Start LoRA Training") - stop_button = gr.Button("Interrupt") - - output = gr.Markdown(value="Ready") - - with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'): - with gr.Row(): - with gr.Column(): - models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True) - evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') - with gr.Row(): - stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') - max_length = gr.Slider(label='max_length', minimum=0, maximum=8096, value=0, step=1, info='The context for each evaluation. 
If set to 0, the maximum context length for the model will be used.') - - with gr.Row(): - start_current_evaluation = gr.Button("Evaluate loaded model") - start_evaluation = gr.Button("Evaluate selected models") - stop_evaluation = gr.Button("Interrupt") - - with gr.Column(): - evaluation_log = gr.Markdown(value='') - - evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True) - with gr.Row(): - save_comments = gr.Button('Save comments', elem_classes="small-button") - refresh_table = gr.Button('Refresh the table', elem_classes="small-button") + save_comments = gr.Button('Save comments', elem_classes="small-button") + refresh_table = gr.Button('Refresh the table', elem_classes="small-button") # Training events - all_params = [lora_name, always_override, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, raw_text_file, overlap_len, newline_favor_len, higher_rank_limit, warmup_steps, optimizer, hard_cut_string, train_only_after, stop_at_loss, add_eos_token, min_chars, report_to] copy_from.change(do_copy_params, [copy_from] + all_params, all_params) @@ -172,7 +172,6 @@ def create_train_interface(): ev = start_evaluation.click(calculate_perplexity, [models, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False) start_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False) - tmp = gr.State('') start_current_evaluation.click(lambda: ['current model'], None, tmp) ev_cur = start_current_evaluation.click(calculate_perplexity, [tmp, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False) start_current_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False) @@ -215,8 +214,6 @@ def change_rank_limit(use_higher_ranks: bool): def clean_path(base_path: str, path: str): """Strips unusual symbols and forcibly builds a path as relative to the intended directory.""" - # TODO: Probably could do with a security audit to guarantee there's no ways this can be bypassed to target an unwanted path. - # Or swap it to a strict whitelist of [a-zA-Z_0-9] path = path.replace('\\', '/').replace('..', '_') if base_path is None: return path @@ -281,13 +278,13 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch WANT_INTERRUPT = False # == Input validation / processing == - yield "Prepping..." + yield "Preparing the input..." lora_file_path = clean_path(None, lora_name) if lora_file_path.strip() == '': yield "Missing or invalid LoRA file name input." return - lora_file_path = f"{shared.args.lora_dir}/{lora_file_path}" + lora_file_path = f"{Path(shared.args.lora_dir)}/{lora_file_path}" actual_lr = float(learning_rate) model_type = type(shared.model).__name__ @@ -308,15 +305,10 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch time.sleep(5) - if shared.args.wbits > 0 and not shared.args.monkey_patch: - yield "LoRA training with GPTQ models requires loading with `--monkey-patch`" + if shared.args.loader == 'GPTQ-for-LLaMa' and not shared.args.monkey_patch: + yield "LoRA training with GPTQ-for-LLaMa requires loading with `--monkey-patch`" return - elif not (shared.args.load_in_8bit or shared.args.load_in_4bit) and shared.args.wbits <= 0: - yield "It is highly recommended you use `--load-in-8bit` for LoRA training. 
*(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*" - logger.warning("It is highly recommended you use `--load-in-8bit` for LoRA training.") - time.sleep(2) # Give it a moment for the message to show in UI before continuing - if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0: yield "Cannot input zeroes." return @@ -403,7 +395,6 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch logger.warning("EOS and BOS tokens are identical when adding EOS tokens. Check model config.") for text_part in raw_text.split(cut_string): - if len(text_part.strip()) <= min_chars: continue @@ -430,11 +421,11 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch eval_data = None else: if dataset in ['None', '']: - yield "**Missing dataset choice input, cannot continue.**" + yield "Missing dataset choice input, cannot continue." return if format in ['None', '']: - yield "**Missing format choice input, cannot continue.**" + yield "Missing format choice input, cannot continue." return train_template["template_type"] = "dataset" @@ -477,8 +468,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch print("\033[1;31;1m(Model has been modified by previous training, it needs to be reloaded...)\033[0;37;0m") try: yield f"Reloading {selected_model}..." - unload_model() - shared.model, shared.tokenizer = load_model(shared.model_name, None) + reload_model() if shared.model is not None: print("Model reloaded OK, continue with training.") else: @@ -487,17 +477,17 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch exc = traceback.format_exc() logger.error('Failed to reload the model.') print(exc) - return exc + return exc.replace('\n', '\n\n') # == Start prepping the model itself == if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'): logger.info("Getting model ready...") - prepare_model_for_int8_training(shared.model) + prepare_model_for_kbit_training(shared.model) # base model is now frozen and should not be reused for any other LoRA training than this one shared.model_dirty_from_training = True - logger.info("Prepping for training...") + logger.info("Preparing for training...") config = LoraConfig( r=lora_rank, lora_alpha=lora_alpha, @@ -522,7 +512,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin") set_peft_model_state_dict(lora_model, state_dict_peft) except: - yield traceback.format_exc() + yield traceback.format_exc().replace('\n', '\n\n') return if shared.args.monkey_patch: @@ -708,10 +698,10 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if WANT_INTERRUPT: logger.info("Training interrupted.") - yield f"Interrupted. Incomplete LoRA saved to `{lora_file_path}`" + yield f"Interrupted. Incomplete LoRA saved to `{lora_file_path}`." else: logger.info("Training complete!") - yield f"Done! LoRA saved to `{lora_file_path}`" + yield f"Done! LoRA saved to `{lora_file_path}`.\n\nBefore testing your new LoRA, make sure to first reload the model, as it is currently dirty from training." 
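
The training-tab sliders added above encode two simple relationships that are only spelled out in their help text: the effective gradient-accumulation steps are the global batch size divided by the micro batch size, and the scaling applied to a LoRA is alpha divided by rank. The sketch below, using the sliders' default values, is purely illustrative and not part of the patch:

```
# Illustrative only, not part of the patch: the relationships described in the
# slider help text above, evaluated with the sliders' default values.
def training_ratios(batch_size=128, micro_batch_size=4, lora_rank=32, lora_alpha=64):
    # "gradientAccum = batch / microBatch"
    gradient_accumulation_steps = batch_size // micro_batch_size
    # "This divided by the rank becomes the scaling of the LoRA"
    lora_scaling = lora_alpha / lora_rank
    return gradient_accumulation_steps, lora_scaling

print(training_ratios())  # (32, 2.0)
```
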
def split_chunks(arr, size: int, step: int, max_newline_length: int, newline_tokens: set): diff --git a/modules/ui.py b/modules/ui.py index d9b3a131..aa72f287 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -1,20 +1,23 @@ -import json +import copy from pathlib import Path import gradio as gr import torch +import yaml from modules import shared with open(Path(__file__).resolve().parent / '../css/main.css', 'r') as f: css = f.read() -with open(Path(__file__).resolve().parent / '../css/chat.css', 'r') as f: - chat_css = f.read() -with open(Path(__file__).resolve().parent / '../css/main.js', 'r') as f: - main_js = f.read() -with open(Path(__file__).resolve().parent / '../css/chat.js', 'r') as f: - chat_js = f.read() +with open(Path(__file__).resolve().parent / '../js/main.js', 'r') as f: + js = f.read() +with open(Path(__file__).resolve().parent / '../js/save_files.js', 'r') as f: + save_files_js = f.read() +with open(Path(__file__).resolve().parent / '../js/switch_tabs.js', 'r') as f: + switch_tabs_js = f.read() +with open(Path(__file__).resolve().parent / '../js/show_controls.js', 'r') as f: + show_controls_js = f.read() refresh_symbol = '🔄' delete_symbol = '🗑️' @@ -30,6 +33,11 @@ theme = gr.themes.Default( background_fill_secondary='#eaeaea' ) +if Path("notification.mp3").exists(): + audio_notification_js = "document.querySelector('#audio_notification audio')?.play();" +else: + audio_notification_js = "" + def list_model_elements(): elements = [ @@ -54,12 +62,16 @@ def list_model_elements(): 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', + 'disable_exllama', + 'cfg_cache', 'threads', 'n_batch', 'no_mmap', 'low_vram', 'mlock', + 'mul_mat_q', 'n_gpu_layers', + 'tensor_split', 'n_ctx', 'n_gqa', 'rms_norm_eps', @@ -67,7 +79,8 @@ def list_model_elements(): 'gpu_split', 'max_seq_len', 'compress_pos_emb', - 'alpha_value' + 'alpha_value', + 'rope_freq_base' ] for i in range(torch.cuda.device_count()): @@ -79,6 +92,7 @@ def list_model_elements(): def list_interface_input_elements(): elements = [ 'max_new_tokens', + 'auto_max_new_tokens', 'seed', 'temperature', 'top_p', @@ -99,6 +113,8 @@ def list_interface_input_elements(): 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'negative_prompt', + 'guidance_scale', 'add_bos_token', 'ban_eos_token', 'truncation_length', @@ -109,31 +125,38 @@ def list_interface_input_elements(): 'top_a', ] - if shared.args.chat: - elements += [ - 'character_menu', - 'history', - 'name1', - 'name2', - 'greeting', - 'context', - 'chat_generation_attempts', - 'stop_at_newline', - 'mode', - 'instruction_template', - 'name1_instruct', - 'name2_instruct', - 'context_instruct', - 'turn_template', - 'chat_style', - 'chat-instruct_command', - ] - else: - elements.append('textbox') - if not shared.args.notebook: - elements.append('output_textbox') + # Chat elements + elements += [ + 'textbox', + 'start_with', + 'character_menu', + 'history', + 'name1', + 'name2', + 'greeting', + 'context', + 'mode', + 'instruction_template', + 'name1_instruct', + 'name2_instruct', + 'context_instruct', + 'turn_template', + 'chat_style', + 'chat-instruct_command', + ] + # Notebook/default elements + elements += [ + 'textbox-notebook', + 'textbox-default', + 'output_textbox', + 'prompt_menu-default', + 'prompt_menu-notebook', + ] + + # Model elements elements += list_model_elements() + return elements @@ -144,9 +167,6 @@ def gather_interface_values(*args): if not shared.args.multi_user: shared.persistent_interface_state = output - Path('logs').mkdir(exist_ok=True) - with 
open(Path(f'logs/session_{shared.get_mode()}_autosave.json'), 'w') as f: - f.write(json.dumps(output, indent=4)) return output @@ -162,6 +182,25 @@ def apply_interface_values(state, use_persistent=False): return [state[k] if k in state else gr.update() for k in elements] +def save_settings(state, preset, instruction_template, extensions, show_controls): + output = copy.deepcopy(shared.settings) + exclude = ['name1', 'name2', 'greeting', 'context', 'turn_template'] + for k in state: + if k in shared.settings and k not in exclude: + output[k] = state[k] + + output['preset'] = preset + output['prompt-default'] = state['prompt_menu-default'] + output['prompt-notebook'] = state['prompt_menu-notebook'] + output['character'] = state['character_menu'] + output['instruction_template'] = instruction_template + output['default_extensions'] = extensions + output['seed'] = int(output['seed']) + output['show_controls'] = show_controls + + return yaml.dump(output, sort_keys=False, width=float("inf")) + + class ToolButton(gr.Button, gr.components.IOComponent): """ Small button with single emoji as text, fits inside gradio forms diff --git a/modules/ui_chat.py b/modules/ui_chat.py new file mode 100644 index 00000000..72e8cd03 --- /dev/null +++ b/modules/ui_chat.py @@ -0,0 +1,292 @@ +import json +from functools import partial +from pathlib import Path + +import gradio as gr +from PIL import Image + +from modules import chat, prompts, shared, ui, utils +from modules.html_generator import chat_html_wrapper +from modules.text_generation import stop_everything_event +from modules.utils import gradio + +inputs = ('Chat input', 'interface_state') +reload_arr = ('history', 'name1', 'name2', 'mode', 'chat_style') +clear_arr = ('Clear history-confirm', 'Clear history', 'Clear history-cancel') + + +def create_ui(): + shared.gradio['Chat input'] = gr.State() + shared.gradio['dummy'] = gr.State() + shared.gradio['history'] = gr.State({'internal': [], 'visible': []}) + + with gr.Tab('Chat', elem_id='chat-tab'): + shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) + shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input') + shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls', elem_id='show-controls') + + with gr.Row(): + shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') + shared.gradio['Generate'] = gr.Button('Generate', elem_id='Generate', variant='primary') + shared.gradio['Continue'] = gr.Button('Continue') + + with gr.Row(): + shared.gradio['Impersonate'] = gr.Button('Impersonate') + shared.gradio['Regenerate'] = gr.Button('Regenerate') + shared.gradio['Remove last'] = gr.Button('Remove last', elem_classes=['button_nowrap']) + + with gr.Row(): + shared.gradio['Copy last reply'] = gr.Button('Copy last reply') + shared.gradio['Replace last reply'] = gr.Button('Replace last reply') + shared.gradio['Send dummy message'] = gr.Button('Send dummy message') + shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') + + with gr.Row(): + shared.gradio['Clear history'] = gr.Button('Clear history') + shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant='stop', visible=False) + shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) + + with gr.Row(): + shared.gradio['send-chat-to-default'] = gr.Button('Send to default') + shared.gradio['send-chat-to-notebook'] = 
gr.Button('Send to notebook') + + with gr.Row(): + shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with']) + + with gr.Row(): + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode') + shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') + + +def create_chat_settings_ui(): + with gr.Tab('Character'): + with gr.Row(): + with gr.Column(scale=8): + with gr.Row(): + shared.gradio['character_menu'] = gr.Dropdown(value='None', choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button') + shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button') + + shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') + shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') + shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=10, label='Context', elem_classes=['add_scrollbar']) + shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=5, label='Greeting', elem_classes=['add_scrollbar']) + + with gr.Column(scale=1): + shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') + shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None) + + with gr.Tab('Instruction template'): + with gr.Row(): + with gr.Row(): + shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Instruction template', value='None', info='Change this according to the model/LoRA that you are using. 
Used in instruct and chat-instruct modes.', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button') + shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button') + + shared.gradio['name1_instruct'] = gr.Textbox(value='', lines=2, label='User string') + shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') + shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') + shared.gradio['turn_template'] = gr.Textbox(value='', lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') + with gr.Row(): + shared.gradio['send_instruction_to_default'] = gr.Button('Send to default', elem_classes=['small-button']) + shared.gradio['send_instruction_to_notebook'] = gr.Button('Send to notebook', elem_classes=['small-button']) + shared.gradio['send_instruction_to_negative_prompt'] = gr.Button('Send to negative prompt', elem_classes=['small-button']) + + with gr.Row(): + shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar']) + + with gr.Tab('Chat history'): + with gr.Row(): + with gr.Column(): + shared.gradio['save_chat_history'] = gr.Button(value='Save history') + + with gr.Column(): + shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON') + + with gr.Tab('Upload character'): + with gr.Tab('YAML or JSON'): + with gr.Row(): + shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File') + shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)') + + shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False) + + with gr.Tab('TavernAI PNG'): + with gr.Row(): + with gr.Column(): + shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id='upload_img_tavern') + shared.gradio['tavern_json'] = gr.State() + with gr.Column(): + shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) + shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False) + + shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False) + + +def create_event_handlers(): + + # Obsolete variables, kept for compatibility with old extensions + shared.input_params = gradio(inputs) + shared.reload_inputs = gradio(reload_arr) + + shared.gradio['Generate'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + 
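
The event wiring in this new file leans on the small `gradio()` helper imported from `modules.utils`, whose implementation is not shown in this diff. Based on how it is called here (sometimes with a tuple such as `inputs`, sometimes with several component names), it presumably resolves names to the component objects registered in `shared.gradio`. A minimal sketch of that assumed behavior, which may differ from the real helper:

```
# Assumed behavior of modules.utils.gradio, inferred from its usage in this file.
from modules import shared

def gradio(*keys):
    # Accept both gradio(('a', 'b')) and gradio('a', 'b')
    if len(keys) == 1 and type(keys[0]) in [list, tuple]:
        keys = keys[0]

    # Map component names to the objects stored in shared.gradio
    return [shared.gradio[k] for k in keys]
```

Under that reading, each `.click(...).then(...)` chain can be written as a flat list of component names instead of long lists of component objects.
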
shared.gradio['textbox'].submit( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['Regenerate'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['Continue'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['Impersonate'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( + chat.impersonate_wrapper, gradio(inputs), gradio('textbox'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['Replace last reply'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.replace_last_reply, gradio('textbox', 'interface_state'), gradio('history')).then( + lambda: '', None, gradio('textbox'), show_progress=False).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Send dummy message'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.send_dummy_message, gradio('textbox', 'interface_state'), gradio('history')).then( + lambda: '', None, gradio('textbox'), show_progress=False).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Send dummy reply'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.send_dummy_reply, gradio('textbox', 'interface_state'), gradio('history')).then( + lambda: '', None, gradio('textbox'), show_progress=False).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Clear history'].click(lambda: [gr.update(visible=True), gr.update(visible=False), 
gr.update(visible=True)], None, gradio(clear_arr)) + shared.gradio['Clear history-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) + shared.gradio['Clear history-confirm'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)).then( + chat.clear_chat_log, gradio('interface_state'), gradio('history')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Remove last'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.remove_last_message, gradio('history'), gradio('textbox', 'history'), show_progress=False).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['character_menu'].change( + partial(chat.load_character, instruct=False), gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context', 'dummy')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.load_persistent_history, gradio('interface_state'), gradio('history')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')) + + shared.gradio['Stop'].click( + stop_everything_event, None, None, queue=False).then( + chat.redraw_html, gradio(reload_arr), gradio('display')) + + shared.gradio['mode'].change( + lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( + chat.redraw_html, gradio(reload_arr), gradio('display')) + + shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display')) + shared.gradio['instruction_template'].change( + partial(chat.load_character, instruct=True), gradio('instruction_template', 'name1_instruct', 'name2_instruct'), gradio('name1_instruct', 'name2_instruct', 'dummy', 'dummy', 'context_instruct', 'turn_template')) + + shared.gradio['load_chat_history'].upload( + chat.load_history, gradio('load_chat_history', 'history'), gradio('history')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}') + + shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) + + # Save/delete a character + shared.gradio['save_character'].click( + lambda x: x, gradio('name2'), gradio('save_character_filename')).then( + lambda: gr.update(visible=True), None, gradio('character_saver')) + + shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter')) + + shared.gradio['save_template'].click( + lambda: 'My Template.yaml', None, gradio('save_filename')).then( + lambda: 'instruction-templates/', None, gradio('save_root')).then( + chat.generate_instruction_template_yaml, gradio('name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template'), gradio('save_contents')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_template'].click( + lambda x: f'{x}.yaml', gradio('instruction_template'), gradio('delete_filename')).then( + lambda: 'instruction-templates/', 
None, gradio('delete_root')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + shared.gradio['save_chat_history'].click( + lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( + None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f'(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}') + + shared.gradio['Submit character'].click( + chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') + + shared.gradio['Submit tavern character'].click( + chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') + + shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) + shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) + shared.gradio['upload_img_tavern'].upload(chat.check_tavern_character, gradio('upload_img_tavern'), gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) + shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) + shared.gradio['your_picture'].change( + chat.upload_your_profile_picture, gradio('your_picture'), None).then( + partial(chat.redraw_html, reset_cache=True), gradio(reload_arr), gradio('display')) + + shared.gradio['send_instruction_to_default'].click( + prompts.load_instruction_prompt_simple, gradio('instruction_template'), gradio('textbox-default')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') + + shared.gradio['send_instruction_to_notebook'].click( + prompts.load_instruction_prompt_simple, gradio('instruction_template'), gradio('textbox-notebook')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') + + shared.gradio['send_instruction_to_negative_prompt'].click( + prompts.load_instruction_prompt_simple, gradio('instruction_template'), gradio('negative_prompt')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}') + + shared.gradio['send-chat-to-default'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-default')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') + + shared.gradio['send-chat-to-notebook'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-notebook')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') + + shared.gradio['show_controls'].change(None, gradio('show_controls'), None, _js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') diff --git a/modules/ui_default.py b/modules/ui_default.py new file mode 100644 index 00000000..29b9bee5 --- /dev/null +++ b/modules/ui_default.py @@ -0,0 +1,96 @@ +import gradio as gr + +from modules import logits, shared, ui, utils +from 
modules.prompts import count_tokens, load_prompt +from modules.text_generation import ( + generate_reply_wrapper, + stop_everything_event +) +from modules.utils import gradio + +inputs = ('textbox-default', 'interface_state') +outputs = ('output_textbox', 'html-default') + + +def create_ui(): + with gr.Tab('Default', elem_id='default-tab'): + shared.gradio['last_input-default'] = gr.State('') + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['textbox-default'] = gr.Textbox(value='', elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') + shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_classes=["token-counter", "default-token-counter"]) + + with gr.Row(): + shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary') + shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop') + shared.gradio['Continue-default'] = gr.Button('Continue') + + with gr.Row(): + shared.gradio['prompt_menu-default'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['prompt_menu-default'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, 'refresh-button') + shared.gradio['save_prompt-default'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_prompt-default'] = gr.Button('🗑️', elem_classes='refresh-button') + + with gr.Column(): + with gr.Tab('Raw'): + shared.gradio['output_textbox'] = gr.Textbox(lines=27, label='Output', elem_classes=['textbox_default_output', 'add_scrollbar']) + + with gr.Tab('Markdown'): + shared.gradio['markdown_render-default'] = gr.Button('Render') + shared.gradio['markdown-default'] = gr.Markdown() + + with gr.Tab('HTML'): + shared.gradio['html-default'] = gr.HTML() + + with gr.Tab('Logits'): + with gr.Row(): + with gr.Column(scale=10): + shared.gradio['get_logits-default'] = gr.Button('Get next token probabilities') + with gr.Column(scale=1): + shared.gradio['use_samplers-default'] = gr.Checkbox(label='Use samplers', value=True, elem_classes=['no-background']) + + with gr.Row(): + shared.gradio['logits-default'] = gr.Textbox(lines=23, label='Output', elem_classes=['textbox_logits', 'add_scrollbar']) + shared.gradio['logits-default-previous'] = gr.Textbox(lines=23, label='Previous output', elem_classes=['textbox_logits', 'add_scrollbar']) + + +def create_event_handlers(): + shared.gradio['Generate-default'].click( + lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['textbox-default'].submit( + lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False) + shared.gradio['Continue-default'].click( + 
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False) + shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False) + shared.gradio['save_prompt-default'].click( + lambda x: x, gradio('textbox-default'), gradio('save_contents')).then( + lambda: 'prompts/', None, gradio('save_root')).then( + lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_prompt-default'].click( + lambda: 'prompts/', None, gradio('delete_root')).then( + lambda x: x + '.txt', gradio('prompt_menu-default'), gradio('delete_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + shared.gradio['textbox-default'].change(lambda x: f"{count_tokens(x)}", gradio('textbox-default'), gradio('token-counter-default'), show_progress=False) + shared.gradio['get_logits-default'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + logits.get_next_logits, gradio('textbox-default', 'interface_state', 'use_samplers-default', 'logits-default'), gradio('logits-default', 'logits-default-previous'), show_progress=False) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py new file mode 100644 index 00000000..b4674426 --- /dev/null +++ b/modules/ui_file_saving.py @@ -0,0 +1,110 @@ +import copy +import json + +import gradio as gr + +from modules import chat, presets, shared, ui, ui_chat, utils +from modules.utils import gradio + + +def create_ui(): + + # Text file saver + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_saver']: + shared.gradio['save_filename'] = gr.Textbox(lines=1, label='File name') + shared.gradio['save_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. Unchangeable.', interactive=False) + shared.gradio['save_contents'] = gr.Textbox(lines=10, label='File contents') + with gr.Row(): + shared.gradio['save_confirm'] = gr.Button('Save', elem_classes="small-button") + shared.gradio['save_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + # Text file deleter + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_deleter']: + shared.gradio['delete_filename'] = gr.Textbox(lines=1, label='File name') + shared.gradio['delete_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. 
Unchangeable.', interactive=False) + with gr.Row(): + shared.gradio['delete_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') + shared.gradio['delete_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + # Character saver/deleter + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']: + shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.') + with gr.Row(): + shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button") + shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_deleter']: + gr.Markdown('Confirm the character deletion?') + with gr.Row(): + shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') + shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + +def create_event_handlers(): + shared.gradio['save_confirm'].click( + lambda x, y, z: utils.save_file(x + y, z), gradio('save_root', 'save_filename', 'save_contents'), None).then( + lambda: gr.update(visible=False), None, gradio('file_saver')) + + shared.gradio['delete_confirm'].click( + lambda x, y: utils.delete_file(x + y), gradio('delete_root', 'delete_filename'), None).then( + lambda: gr.update(visible=False), None, gradio('file_deleter')) + + shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter')) + shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver')) + + shared.gradio['save_character_confirm'].click( + chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( + lambda: gr.update(visible=False), None, gradio('character_saver')).then( + lambda x: gr.update(choices=utils.get_available_characters(), value=x), gradio('save_character_filename'), gradio('character_menu')) + + shared.gradio['delete_character_confirm'].click( + chat.delete_character, gradio('character_menu'), None).then( + lambda: gr.update(visible=False), None, gradio('character_deleter')).then( + lambda: gr.update(choices=utils.get_available_characters(), value="None"), None, gradio('character_menu')) + + shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) + shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter')) + + shared.gradio['save_preset'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + presets.generate_preset_yaml, gradio('interface_state'), gradio('save_contents')).then( + lambda: 'presets/', None, gradio('save_root')).then( + lambda: 'My Preset.yaml', None, gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_preset'].click( + lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then( + lambda: 'presets/', None, gradio('delete_root')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + if not shared.args.multi_user: + shared.gradio['save_session'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + save_session, gradio('interface_state'), 
gradio('temporary_text')).then( + None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents)}}") + + shared.gradio['load_session'].upload( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')).then( + None, None, None, _js='() => {alert("The session has been loaded.")}') + + +def load_session(file, state): + decoded_file = file if type(file) == str else file.decode('utf-8') + data = json.loads(decoded_file) + + if 'character_menu' in data and state.get('character_menu') != data.get('character_menu'): + shared.session_is_loading = True + + state.update(data) + return state + + +def save_session(state): + output = copy.deepcopy(state) + for key in ['prompt_menu-default', 'prompt_menu-notebook']: + del output[key] + + return json.dumps(output, indent=4) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py new file mode 100644 index 00000000..30f41590 --- /dev/null +++ b/modules/ui_model_menu.py @@ -0,0 +1,249 @@ +import importlib +import math +import re +import traceback +from functools import partial + +import gradio as gr +import psutil +import torch + +from modules import loaders, shared, ui, utils +from modules.logging_colors import logger +from modules.LoRA import add_lora_to_model +from modules.models import load_model, unload_model +from modules.models_settings import ( + apply_model_settings_to_state, + save_model_settings, + update_model_parameters +) +from modules.utils import gradio + + +def create_ui(): + # Finding the default values for the GPU and CPU memories + total_mem = [] + for i in range(torch.cuda.device_count()): + total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024))) + + default_gpu_mem = [] + if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0: + for i in shared.args.gpu_memory: + if 'mib' in i.lower(): + default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i))) + else: + default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000) + + while len(default_gpu_mem) < len(total_mem): + default_gpu_mem.append(0) + + total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024)) + if shared.args.cpu_memory is not None: + default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory) + else: + default_cpu_mem = 0 + + with gr.Tab("Model", elem_id="model-tab"): + with gr.Row(): + with gr.Column(): + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=shared.model_name, label='Model', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button') + shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button') + shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button') + shared.gradio['reload_model'] = gr.Button("Reload", elem_classes='refresh-button') + shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button') + + with gr.Column(): + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, 
choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button') + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button') + + with gr.Row(): + with gr.Column(): + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None) + with gr.Box(): + with gr.Row(): + with gr.Column(): + for i in range(len(total_mem)): + shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) + + shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) + shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) + + shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers) + shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx) + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) + shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) + shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='GGML only (not used by GGUF): Grouped-Query Attention. Must be 8 for llama-2 70b.') + shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='GGML only (not used by GGUF): 5e-6 is a good value for llama-2 models.') + + shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") + shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") + shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None") + shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) + shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') + shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') + shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) + shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) + shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. 
Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) + shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) + + with gr.Column(): + shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) + shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.') + shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') + shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') + shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') + shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel, which can improve inference speed on some systems.') + shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) + shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) + shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) + shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) + shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) + shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) + shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) + shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) + shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram) + shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) + shared.gradio['mul_mat_q'] = gr.Checkbox(label="mul_mat_q", value=shared.args.mul_mat_q) + shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.') + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17') + shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) + shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') + shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. 
Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') + shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') + shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.') + shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".') + + with gr.Column(): + with gr.Row(): + shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.') + + shared.gradio['custom_model_menu'] = gr.Textbox(label="Download custom model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main") + shared.gradio['download_model_button'] = gr.Button("Download") + + with gr.Row(): + shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready') + + +def create_event_handlers(): + shared.gradio['loader'].change( + loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then( + lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type')) + + # In this event handler, the interface state is read and updated + # with the model defaults (if any), and then the model is loaded + # unless "autoload_model" is unchecked + shared.gradio['model_menu'].change( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + update_model_parameters, gradio('interface_state'), None).then( + load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success( + update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')) + + shared.gradio['load_model'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + update_model_parameters, gradio('interface_state'), None).then( + partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success( + update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')) + + shared.gradio['unload_model'].click( + unload_model, None, None).then( + lambda: "Model unloaded", None, gradio('model_status')) + + shared.gradio['reload_model'].click( + unload_model, None, None).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + update_model_parameters, gradio('interface_state'), None).then( + partial(load_model_wrapper, 
autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success( + update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')) + + shared.gradio['save_model_settings'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) + + shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu'), gradio('model_status'), show_progress=True) + shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model')) + + +def load_model_wrapper(selected_model, loader, autoload=False): + if not autoload: + yield f"The settings for {selected_model} have been updated.\nClick on \"Load\" to load it." + return + + if selected_model == 'None': + yield "No model selected" + else: + try: + yield f"Loading {selected_model}..." + shared.model_name = selected_model + unload_model() + if selected_model != '': + shared.model, shared.tokenizer = load_model(shared.model_name, loader) + + if shared.model is not None: + yield f"Successfully loaded {selected_model}" + else: + yield f"Failed to load {selected_model}." + except: + exc = traceback.format_exc() + logger.error('Failed to load the model.') + print(exc) + yield exc.replace('\n', '\n\n') + + +def load_lora_wrapper(selected_loras): + yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras))) + add_lora_to_model(selected_loras) + yield ("Successfuly applied the LoRAs") + + +def download_model_wrapper(repo_id, progress=gr.Progress()): + try: + downloader_module = importlib.import_module("download-model") + downloader = downloader_module.ModelDownloader() + repo_id_parts = repo_id.split(":") + model = repo_id_parts[0] if len(repo_id_parts) > 0 else repo_id + branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main" + check = False + + progress(0.0) + yield ("Cleaning up the model/branch names") + model, branch = downloader.sanitize_model_and_branch_names(model, branch) + + yield ("Getting the download links from Hugging Face") + links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch, text_only=False) + + yield ("Getting the output folder") + base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir + output_folder = downloader.get_output_folder(model, branch, is_lora, base_folder=base_folder) + + if check: + progress(0.5) + yield ("Checking previously downloaded files") + downloader.check_model_files(model, branch, links, sha256, output_folder) + progress(1.0) + else: + yield (f"Downloading files to {output_folder}") + downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1) + yield ("Done!") + except: + progress(1.0) + yield traceback.format_exc().replace('\n', '\n\n') + + +def update_truncation_length(current_length, state): + if state['loader'] in ['ExLlama', 'ExLlama_HF']: + return state['max_seq_len'] + elif state['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: + return state['n_ctx'] + else: + return current_length diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py new file mode 100644 index 00000000..e2aeb860 --- /dev/null +++ 
b/modules/ui_notebook.py @@ -0,0 +1,98 @@ +import gradio as gr + +from modules import logits, shared, ui, utils +from modules.prompts import count_tokens, load_prompt +from modules.text_generation import ( + generate_reply_wrapper, + stop_everything_event +) +from modules.utils import gradio + +inputs = ('textbox-notebook', 'interface_state') +outputs = ('textbox-notebook', 'html-notebook') + + +def create_ui(): + with gr.Tab('Notebook', elem_id='notebook-tab'): + shared.gradio['last_input-notebook'] = gr.State('') + with gr.Row(): + with gr.Column(scale=4): + with gr.Tab('Raw'): + with gr.Row(): + shared.gradio['textbox-notebook'] = gr.Textbox(value='', elem_classes=['textbox', 'add_scrollbar'], lines=27) + shared.gradio['token-counter-notebook'] = gr.HTML(value="0", elem_classes=["token-counter"]) + + with gr.Tab('Markdown'): + shared.gradio['markdown_render-notebook'] = gr.Button('Render') + shared.gradio['markdown-notebook'] = gr.Markdown() + + with gr.Tab('HTML'): + shared.gradio['html-notebook'] = gr.HTML() + + with gr.Tab('Logits'): + with gr.Row(): + with gr.Column(scale=10): + shared.gradio['get_logits-notebook'] = gr.Button('Get next token probabilities') + with gr.Column(scale=1): + shared.gradio['use_samplers-notebook'] = gr.Checkbox(label='Use samplers', value=True, elem_classes=['no-background']) + + with gr.Row(): + shared.gradio['logits-notebook'] = gr.Textbox(lines=23, label='Output', elem_classes=['textbox_logits_notebook', 'add_scrollbar']) + shared.gradio['logits-notebook-previous'] = gr.Textbox(lines=23, label='Previous output', elem_classes=['textbox_logits_notebook', 'add_scrollbar']) + + with gr.Row(): + shared.gradio['Generate-notebook'] = gr.Button('Generate', variant='primary', elem_classes='small-button') + shared.gradio['Stop-notebook'] = gr.Button('Stop', elem_classes='small-button', elem_id='stop') + shared.gradio['Undo'] = gr.Button('Undo', elem_classes='small-button') + shared.gradio['Regenerate-notebook'] = gr.Button('Regenerate', elem_classes='small-button') + + with gr.Column(scale=1): + gr.HTML('
    ') + with gr.Row(): + shared.gradio['prompt_menu-notebook'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['prompt_menu-notebook'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, ['refresh-button', 'refresh-button-small']) + shared.gradio['save_prompt-notebook'] = gr.Button('💾', elem_classes=['refresh-button', 'refresh-button-small']) + shared.gradio['delete_prompt-notebook'] = gr.Button('🗑️', elem_classes=['refresh-button', 'refresh-button-small']) + + +def create_event_handlers(): + shared.gradio['Generate-notebook'].click( + lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['textbox-notebook'].submit( + lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False) + shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False) + shared.gradio['Regenerate-notebook'].click( + lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') + + shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False) + shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False) + shared.gradio['save_prompt-notebook'].click( + lambda x: x, gradio('textbox-notebook'), gradio('save_contents')).then( + lambda: 'prompts/', None, gradio('save_root')).then( + lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_prompt-notebook'].click( + lambda: 'prompts/', None, gradio('delete_root')).then( + lambda x: x + '.txt', gradio('prompt_menu-notebook'), gradio('delete_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + shared.gradio['textbox-notebook'].input(lambda x: f"{count_tokens(x)}", gradio('textbox-notebook'), gradio('token-counter-notebook'), show_progress=False) + shared.gradio['get_logits-notebook'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + logits.get_next_logits, gradio('textbox-notebook', 'interface_state', 'use_samplers-notebook', 'logits-notebook'), 
gradio('logits-notebook', 'logits-notebook-previous'), show_progress=False) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py new file mode 100644 index 00000000..b5ce5ac9 --- /dev/null +++ b/modules/ui_parameters.py @@ -0,0 +1,140 @@ +import gradio as gr + +from modules import loaders, presets, shared, ui, ui_chat, utils +from modules.utils import gradio + + +def create_ui(default_preset): + generate_params = presets.load_preset(default_preset) + with gr.Tab("Parameters", elem_id="parameters"): + with gr.Tab("Generation"): + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Preset', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button') + shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') + + with gr.Column(): + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') + + with gr.Row(): + with gr.Column(): + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') + shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') + shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') + shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') + shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') + shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') + shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') + shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') + + with gr.Column(): + shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') + shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') + shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') + shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') + shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length') + shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') + shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + + with gr.Accordion("Learn more", open=False): + gr.Markdown(""" + + For a technical description of the parameters, the [transformers 
documentation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig) is a good reference. + + The best presets, according to the [Preset Arena](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md) experiment, are: + + * Instruction following: + 1) Divine Intellect + 2) Big O + 3) simple-1 + 4) Space Alien + 5) StarChat + 6) Titanic + 7) tfs-with-top-a + 8) Asterism + 9) Contrastive Search + + * Chat: + 1) Midnight Enigma + 2) Yara + 3) Shortwave + + ### Temperature + Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness. + ### top_p + If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results. + ### top_k + Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results. + ### typical_p + If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text. + ### epsilon_cutoff + In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0. + ### eta_cutoff + In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0. + ### repetition_penalty + Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition. + ### repetition_penalty_range + The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used. + ### encoder_repetition_penalty + Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. + ### no_repeat_ngram_size + If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases. + ### min_length + Minimum generation length in tokens. + ### penalty_alpha + Contrastive Search is enabled by setting this to greater than zero and unchecking "do_sample". It should be used with a low value of top_k, for instance, top_k = 4. + + """, elem_classes="markdown") + + with gr.Column(): + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 1.5 is a good value.') + shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', lines=3, elem_classes=['add_scrollbar']) + shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') + shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') + shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') + + with gr.Column(): + shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. 
do_sample must be unchecked.') + + shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams', info='For Beam Search, along with length_penalty and early_stopping.') + shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') + shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') + + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['truncation_length'] = gr.Slider(value=get_truncation_length(), minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') + shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"') + with gr.Column(): + shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') + shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') + shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') + + shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') + shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') + + ui_chat.create_chat_settings_ui() + + +def create_event_handlers(): + shared.gradio['filter_by_loader'].change(loaders.blacklist_samplers, gradio('filter_by_loader'), gradio(loaders.list_all_samplers()), show_progress=False) + shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) + + +def get_truncation_length(): + if shared.args.max_seq_len != shared.args_defaults.max_seq_len: + return shared.args.max_seq_len + if shared.args.n_ctx != shared.args_defaults.n_ctx: + return shared.args.n_ctx + else: + return shared.settings['truncation_length'] diff --git a/modules/ui_session.py b/modules/ui_session.py new file mode 100644 index 00000000..537a31f2 --- /dev/null +++ b/modules/ui_session.py @@ -0,0 +1,73 @@ +import gradio as gr + +from modules import shared, ui, utils +from modules.github import clone_or_pull_repository +from modules.utils import gradio + + +def create_ui(): + with gr.Tab("Session", elem_id="session-tab"): + with gr.Row(): + with gr.Column(): + shared.gradio['reset_interface'] = gr.Button("Apply flags/extensions and restart") + with gr.Row(): + shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡') + shared.gradio['save_settings'] = gr.Button('Save UI defaults to settings.yaml') + + with gr.Row(): + with gr.Column(): + shared.gradio['extensions_menu'] = gr.CheckboxGroup(choices=utils.get_available_extensions(), value=shared.args.extensions, label="Available extensions", info='Note that some of 
these extensions may require manually installing Python requirements through the command: pip install -r extensions/extension_name/requirements.txt', elem_classes='checkboxgroup-table') + + with gr.Column(): + shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table') + + with gr.Column(): + if not shared.args.multi_user: + shared.gradio['save_session'] = gr.Button('Save session') + shared.gradio['load_session'] = gr.File(type='binary', file_types=['.json'], label="Upload Session JSON") + + extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') + extension_status = gr.Markdown() + + extension_name.submit( + clone_or_pull_repository, extension_name, extension_status, show_progress=False).then( + lambda: gr.update(choices=utils.get_available_extensions(), value=shared.args.extensions), None, gradio('extensions_menu')) + + # Reset interface event + shared.gradio['reset_interface'].click( + set_interface_arguments, gradio('extensions_menu', 'bool_menu'), None).then( + lambda: None, None, None, _js='() => {document.body.innerHTML=\'
    Reloading...
    \'; setTimeout(function(){location.reload()},2500); return []}') + + shared.gradio['toggle_dark_mode'].click(lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}') + shared.gradio['save_settings'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + ui.save_settings, gradio('interface_state', 'preset_menu', 'instruction_template', 'extensions_menu', 'show_controls'), gradio('save_contents')).then( + lambda: './', None, gradio('save_root')).then( + lambda: 'settings.yaml', None, gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + +def set_interface_arguments(extensions, bool_active): + shared.args.extensions = extensions + + bool_list = get_boolean_arguments() + + for k in bool_list: + setattr(shared.args, k, False) + for k in bool_active: + setattr(shared.args, k, True) + + shared.need_restart = True + + +def get_boolean_arguments(active=False): + exclude = ["default", "notebook", "chat"] + + cmd_list = vars(shared.args) + bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in exclude + ui.list_model_elements()]) + bool_active = [k for k in bool_list if vars(shared.args)[k]] + + if active: + return bool_active + else: + return bool_list diff --git a/modules/utils.py b/modules/utils.py index 9ae5dc86..15dbd9dd 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -2,6 +2,7 @@ import os import re from datetime import datetime from pathlib import Path +from typing import Union from modules import shared from modules.logging_colors import logger @@ -9,7 +10,7 @@ from modules.logging_colors import logger # Helper function to get multiple values from shared.gradio def gradio(*keys): - if len(keys) == 1 and type(keys[0]) is list: + if len(keys) == 1 and type(keys[0]) in [list, tuple]: keys = keys[0] return [shared.gradio[k] for k in keys] @@ -71,7 +72,12 @@ def natural_keys(text): def get_available_models(): - return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys) + model_list = [] + for item in list(Path(f'{shared.args.model_dir}/').glob('*')): + if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml', '.py')) and 'llama-tokenizer' not in item.name: + model_list.append(re.sub('.pth$', '', item.name)) + + return sorted(model_list, key=natural_keys) def get_available_presets(): @@ -83,18 +89,17 @@ def get_available_prompts(): files = set((k.stem for k in Path('prompts').glob('*.txt'))) prompts += sorted([k for k in files if re.match('^[0-9]', k)], key=natural_keys, reverse=True) prompts += sorted([k for k in files if re.match('^[^0-9]', k)], key=natural_keys) - prompts += ['Instruct-' + k for k in get_available_instruction_templates() if k != 'None'] prompts += ['None'] return prompts def get_available_characters(): paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml')) - return ['None'] + sorted(set((k.stem for k in paths if k.stem != "instruction-following")), key=natural_keys) + return ['None'] + sorted(set((k.stem for k in paths)), key=natural_keys) def get_available_instruction_templates(): - path = "characters/instruction-following" + path = "instruction-templates" paths = [] if os.path.exists(path): paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml')) @@ -122,6 +127,13 @@ def 
get_available_chat_styles(): return sorted(set(('-'.join(k.stem.split('-')[1:]) for k in Path('css').glob('chat_style*.css'))), key=natural_keys) -def get_available_sessions(): - items = sorted(set(k.stem for k in Path('logs').glob(f'session_{shared.get_mode()}*')), key=natural_keys, reverse=True) - return [item for item in items if 'autosave' in item] + [item for item in items if 'autosave' not in item] +def is_gguf(path: Union[str, Path]) -> bool: + ''' + Determines if a llama.cpp model is in GGUF format + Copied from ctransformers utils.py + ''' + path = str(Path(path).resolve()) + with open(path, "rb") as f: + magic = f.read(4) + + return magic == "GGUF".encode() diff --git a/requirements.txt b/requirements.txt index 9486f808..7311370b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,56 @@ -accelerate==0.21.0 -colorama -datasets -einops +aiofiles==23.1.0 fastapi==0.95.2 gradio_client==0.2.5 gradio==3.33.1 + +accelerate==0.21.* +colorama +datasets +einops markdown -numpy +numpy==1.24 pandas +peft==0.5.* Pillow>=9.5.0 pyyaml requests -safetensors==0.3.1 +safetensors==0.3.2 +transformers==4.32.* scipy sentencepiece tensorboard -transformers==4.31.* tqdm wandb -git+https://github.com/huggingface/peft@96c0277a1b9a381b10ab34dbf84917f9b3b992e6 -bitsandbytes==0.41.0; platform_system != "Windows" -https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/jllllll/exllama/releases/download/0.0.9/exllama-0.0.9+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/jllllll/exllama/releases/download/0.0.9/exllama-0.0.9+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + +# bitsandbytes +bitsandbytes==0.41.1; platform_system != "Windows" +https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" + +# AutoGPTQ +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + +# ExLlama +https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + # llama-cpp-python without GPU support -llama-cpp-python==0.1.77; platform_system != "Windows" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.77/llama_cpp_python-0.1.77-cp310-cp310-win_amd64.whl; platform_system == "Windows" +llama-cpp-python==0.1.79; platform_system != "Windows" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.79/llama_cpp_python-0.1.79-cp310-cp310-win_amd64.whl; platform_system == "Windows" + # llama-cpp-python with CUDA support 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.79+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.79+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + +# llama-cpp-python with GGML support +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python_ggml-0.1.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python_ggml-0.1.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_ggml_cuda-0.1.78+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_ggml_cuda-0.1.78+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + +# GPTQ-for-LLaMa +https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + +# ctransformers +https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.24+cu117-py3-none-any.whl diff --git a/server.py b/server.py index ecb8ddc9..f3f7da96 100644 --- a/server.py +++ b/server.py @@ -14,1083 +14,146 @@ with RequestBlocker(): import matplotlib matplotlib.use('Agg') # This fixes LaTeX rendering on some systems -import importlib import json -import math import os -import re import sys import time -import traceback from functools import partial from pathlib import Path from threading import Lock -import psutil -import torch import yaml -from PIL import Image import modules.extensions as extensions_module -from modules import chat, loaders, presets, shared, training, ui, utils -from modules.extensions import apply_extensions -from modules.github import clone_or_pull_repository -from modules.html_generator import chat_html_wrapper -from modules.LoRA import add_lora_to_model -from modules.models import load_model, unload_model -from modules.models_settings import ( - apply_model_settings_to_state, - get_model_settings_from_yamls, - save_model_settings, - update_model_parameters +from modules import ( + chat, + shared, + training, + ui, + ui_chat, + ui_default, + ui_file_saving, + ui_model_menu, + ui_notebook, + ui_parameters, + ui_session, + utils, ) -from modules.text_generation import ( - generate_reply_wrapper, - get_encoded_length, - stop_everything_event +from modules.extensions import apply_extensions +from modules.LoRA import add_lora_to_model +from modules.models import 
load_model +from modules.models_settings import ( + get_model_settings_from_yamls, + update_model_parameters ) from modules.utils import gradio -def load_model_wrapper(selected_model, loader, autoload=False): - if not autoload: - yield f"The settings for {selected_model} have been updated.\nClick on \"Load\" to load it." - return - - if selected_model == 'None': - yield "No model selected" - else: - try: - yield f"Loading {selected_model}..." - shared.model_name = selected_model - unload_model() - if selected_model != '': - shared.model, shared.tokenizer = load_model(shared.model_name, loader) - - if shared.model is not None: - yield f"Successfully loaded {selected_model}" - else: - yield f"Failed to load {selected_model}." - except: - exc = traceback.format_exc() - logger.error('Failed to load the model.') - print(exc) - yield exc - - -def load_lora_wrapper(selected_loras): - yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras))) - add_lora_to_model(selected_loras) - yield ("Successfuly applied the LoRAs") - - -def load_prompt(fname): - if fname in ['None', '']: - return '' - elif fname.startswith('Instruct-'): - fname = re.sub('^Instruct-', '', fname) - file_path = Path(f'characters/instruction-following/{fname}.yaml') - if not file_path.exists(): - return '' - - with open(file_path, 'r', encoding='utf-8') as f: - data = yaml.safe_load(f) - output = '' - if 'context' in data: - output += data['context'] - - replacements = { - '<|user|>': data['user'], - '<|bot|>': data['bot'], - '<|user-message|>': 'Input', - } - - output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements) - return output.rstrip(' ') - else: - file_path = Path(f'prompts/{fname}.txt') - if not file_path.exists(): - return '' - - with open(file_path, 'r', encoding='utf-8') as f: - text = f.read() - if text[-1] == '\n': - text = text[:-1] - - return text - - -def count_tokens(text): - try: - tokens = get_encoded_length(text) - return f'{tokens} tokens in the input.' - except: - return 'Couldn\'t count the number of tokens. Is a tokenizer loaded?' 
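The event wiring in the new `ui_*.py` modules above leans on the small `gradio()` helper from `modules/utils.py`, which this diff extends to accept a tuple as well as a list, so the module-level `inputs`/`outputs` tuples declared in `ui_notebook.py` (and used similarly in `ui_default.py`) can be passed to it directly. Below is a minimal, self-contained sketch of that lookup pattern; the plain dictionary is only a stand-in for the real `shared.gradio` registry of Gradio components, not the webui's actual state object.

```python
# Minimal sketch of the gradio() helper from modules/utils.py.
# Assumption: a plain dict stands in for shared.gradio, which in the webui
# maps element names to Gradio components.

shared_gradio = {
    'textbox-notebook': '<Textbox>',
    'interface_state': '<State>',
    'html-notebook': '<HTML>',
}


def gradio(*keys):
    # Accepts gradio('a', 'b') as well as gradio(('a', 'b')) / gradio(['a', 'b']),
    # so module-level tuples like inputs/outputs can be passed through unchanged.
    if len(keys) == 1 and type(keys[0]) in [list, tuple]:
        keys = keys[0]

    return [shared_gradio[k] for k in keys]


inputs = ('textbox-notebook', 'interface_state')

print(gradio(inputs))           # ['<Textbox>', '<State>']
print(gradio('html-notebook'))  # ['<HTML>']
```

Returning a list of components looked up by name is what keeps the long `.click(...).then(...)` chains in these modules compact: each step is declared as a (function, input names, output names) triple rather than repeated `shared.gradio[...]` indexing.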
- - -def download_model_wrapper(repo_id, progress=gr.Progress()): - try: - downloader_module = importlib.import_module("download-model") - downloader = downloader_module.ModelDownloader() - repo_id_parts = repo_id.split(":") - model = repo_id_parts[0] if len(repo_id_parts) > 0 else repo_id - branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main" - check = False - - progress(0.0) - yield ("Cleaning up the model/branch names") - model, branch = downloader.sanitize_model_and_branch_names(model, branch) - - yield ("Getting the download links from Hugging Face") - links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch, text_only=False) - - yield ("Getting the output folder") - base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir - output_folder = downloader.get_output_folder(model, branch, is_lora, base_folder=base_folder) - - if check: - progress(0.5) - yield ("Checking previously downloaded files") - downloader.check_model_files(model, branch, links, sha256, output_folder) - progress(1.0) - else: - yield (f"Downloading files to {output_folder}") - downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1) - yield ("Done!") - except: - progress(1.0) - yield traceback.format_exc() - - -def create_model_menus(): - # Finding the default values for the GPU and CPU memories - total_mem = [] - for i in range(torch.cuda.device_count()): - total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024))) - - default_gpu_mem = [] - if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0: - for i in shared.args.gpu_memory: - if 'mib' in i.lower(): - default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i))) - else: - default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000) - while len(default_gpu_mem) < len(total_mem): - default_gpu_mem.append(0) - - total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024)) - if shared.args.cpu_memory is not None: - default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory) - else: - default_cpu_mem = 0 - - with gr.Row(): - with gr.Column(): - with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=shared.model_name, label='Model', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button') - load = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button') - unload = gr.Button("Unload", elem_classes='refresh-button') - reload = gr.Button("Reload", elem_classes='refresh-button') - save_settings = gr.Button("Save settings", elem_classes='refresh-button') - - with gr.Column(): - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button') - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button') - - with gr.Row(): - with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value=None) - with gr.Box(): - with 
gr.Row(): - with gr.Column(): - for i in range(len(total_mem)): - shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) - - shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) - shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) - - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers) - shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx) - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) - shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) - shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. Must be 8 for llama-2 70b.') - shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.n_gqa, info='5e-6 is a good value for llama-2 models.') - - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") - shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") - shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None") - shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) - shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') - shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=2048, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) - shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb) - shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=32, step=1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) - - with gr.Column(): - shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. 
Disable if running low on VRAM.') - shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') - shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') - shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') - shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) - shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) - shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) - shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) - shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) - shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) - shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) - shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) - shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram) - shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) - shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) - shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') - shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') - shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') - shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.') - shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".') - - with gr.Column(): - with gr.Row(): - shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.') - - shared.gradio['custom_model_menu'] = gr.Textbox(label="Download custom model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. 
To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main") - shared.gradio['download_model_button'] = gr.Button("Download") - - with gr.Row(): - shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready') - - shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())) - - # In this event handler, the interface state is read and updated - # with the model defaults (if any), and then the model is loaded - # unless "autoload_model" is unchecked - shared.gradio['model_menu'].change( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - update_model_parameters, gradio('interface_state'), None).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False) - - load.click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - update_model_parameters, gradio('interface_state'), None).then( - partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False) - - unload.click( - unload_model, None, None).then( - lambda: "Model unloaded", None, gradio('model_status')) - - reload.click( - unload_model, None, None).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - update_model_parameters, gradio('interface_state'), None).then( - partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False) - - save_settings.click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - - shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) - shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu'), gradio('model_status'), show_progress=True) - shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), load) - - -def create_chat_settings_menus(): - if not shared.is_chat(): - return - - with gr.Box(): - gr.Markdown("Chat parameters") - with gr.Row(): - with gr.Column(): - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)', info='New generations will be called until either this number is reached or no new content is generated between two iterations.') - - with gr.Column(): - shared.gradio['stop_at_newline'] = gr.Checkbox(value=shared.settings['stop_at_newline'], label='Stop generating at new line character') - - -def create_settings_menus(default_preset): - generate_params = presets.load_preset(default_preset) - 
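Note: the `create_settings_menus` function being removed here seeds its sliders from `presets.load_preset(default_preset)`. As a rough sketch of what that amounts to, assuming presets are plain YAML files of parameter/value pairs under `presets/` (the folder, file name, keys, and defaults below are illustrative, not the web UI's actual implementation):

```python
# Illustrative sketch only: load a generation-parameter preset from a YAML file
# and overlay it on a set of defaults. The folder, file name, and keys are
# assumptions; the web UI's own presets.load_preset() may differ in detail.
from pathlib import Path

import yaml

DEFAULT_PARAMS = {
    'do_sample': True,
    'temperature': 0.7,
    'top_p': 0.9,
    'top_k': 20,
    'repetition_penalty': 1.15,
}


def load_preset_sketch(name: str, folder: str = 'presets') -> dict:
    """Return DEFAULT_PARAMS overridden by the values in <folder>/<name>.yaml."""
    params = dict(DEFAULT_PARAMS)
    path = Path(folder) / f'{name}.yaml'
    if path.exists():
        with open(path, encoding='utf-8') as f:
            params.update(yaml.safe_load(f) or {})
    return params


# For example, the "simple-1" preset mentioned in this patch could be loaded as:
generate_params = load_preset_sketch('simple-1')
```

The sliders defined just below this point read their initial values from the returned dictionary, e.g. `value=generate_params['temperature']`.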
with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button') - shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') - - with gr.Column(): - filter_by_loader = gr.Dropdown(label="Filter by loader", choices=["All", "Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value="All", elem_classes='slim-dropdown') - - with gr.Row(): - with gr.Column(): - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') - shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') - shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') - shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') - shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') - shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') - shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') - shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') - - with gr.Column(): - shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') - shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') - shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') - shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') - shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length') - shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') - shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') - - with gr.Accordion("Learn more", open=False): - gr.Markdown(""" - - For a technical description of the parameters, the [transformers documentation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig) is a good reference. - - The best presets, according to the [Preset Arena](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md) experiment, are: - - * Instruction following: - 1) Divine Intellect - 2) Big O - 3) simple-1 - 4) Space Alien - 5) StarChat - 6) Titanic - 7) tfs-with-top-a - 8) Asterism - 9) Contrastive Search - - * Chat: - 1) Midnight Enigma - 2) Yara - 3) Shortwave - - ### Temperature - Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness. 
- ### top_p - If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results. - ### top_k - Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results. - ### typical_p - If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text. - ### epsilon_cutoff - In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0. - ### eta_cutoff - In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0. - ### repetition_penalty - Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition. - ### repetition_penalty_range - The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used. - ### encoder_repetition_penalty - Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. - ### no_repeat_ngram_size - If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases. - ### min_length - Minimum generation length in tokens. - ### penalty_alpha - Contrastive Search is enabled by setting this to greater than zero and unchecking "do_sample". It should be used with a low value of top_k, for instance, top_k = 4. - - """, elem_classes="markdown") - - with gr.Column(): - create_chat_settings_menus() - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') - shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') - shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') - - with gr.Column(): - shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. do_sample must be unchecked.') - - shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams', info='For Beam Search, along with length_penalty and early_stopping.') - shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') - shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') - - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. 
Most models require this to be at most 2048.') - shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. For instance: "\\nYour Assistant:", "\\nThe assistant:"') - with gr.Column(): - shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') - shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') - - shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') - shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming') - - filter_by_loader.change(loaders.blacklist_samplers, filter_by_loader, gradio(loaders.list_all_samplers()), show_progress=False) - shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a')) - - -def create_file_saving_menus(): - - # Text file saver - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_saver']: - shared.gradio['save_filename'] = gr.Textbox(lines=1, label='File name') - shared.gradio['save_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. Unchangeable.', interactive=False) - shared.gradio['save_contents'] = gr.Textbox(lines=10, label='File contents') - with gr.Row(): - shared.gradio['save_confirm'] = gr.Button('Save', elem_classes="small-button") - shared.gradio['save_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - # Text file deleter - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_deleter']: - shared.gradio['delete_filename'] = gr.Textbox(lines=1, label='File name') - shared.gradio['delete_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. 
Unchangeable.', interactive=False) - with gr.Row(): - shared.gradio['delete_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') - shared.gradio['delete_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - # Character saver/deleter - if shared.is_chat(): - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']: - shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.') - with gr.Row(): - shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button") - shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_deleter']: - gr.Markdown('Confirm the character deletion?') - with gr.Row(): - shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') - shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - -def create_file_saving_event_handlers(): - shared.gradio['save_confirm'].click( - lambda x, y, z: utils.save_file(x + y, z), gradio('save_root', 'save_filename', 'save_contents'), None).then( - lambda: gr.update(visible=False), None, gradio('file_saver')) - - shared.gradio['delete_confirm'].click( - lambda x, y: utils.delete_file(x + y), gradio('delete_root', 'delete_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('file_deleter')) - - shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter')) - shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver')) - if shared.is_chat(): - shared.gradio['save_character_confirm'].click( - chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('character_saver')) - - shared.gradio['delete_character_confirm'].click( - chat.delete_character, gradio('character_menu'), None).then( - lambda: gr.update(visible=False), None, gradio('character_deleter')).then( - lambda: gr.update(choices=utils.get_available_characters()), None, gradio('character_menu')) - - shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) - shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter')) - - shared.gradio['save_preset'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - presets.generate_preset_yaml, gradio('interface_state'), gradio('save_contents')).then( - lambda: 'presets/', None, gradio('save_root')).then( - lambda: 'My Preset.yaml', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_preset'].click( - lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then( - lambda: 'presets/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - - if not shared.args.multi_user: - - def load_session(session, state): - with open(Path(f'logs/{session}.json'), 'r') as f: - state.update(json.loads(f.read())) - - if shared.is_chat(): - chat.save_persistent_history(state['history'], state['character_menu'], state['mode']) - - return state - - if shared.is_chat(): - 
shared.gradio['save_session'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('save_contents')).then( - lambda: 'logs/', None, gradio('save_root')).then( - lambda x: f'session_{shared.get_mode()}_{x + "_" if x not in ["None", None, ""] else ""}{utils.current_time()}.json', gradio('character_menu'), gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['session_menu'].change( - load_session, gradio('session_menu', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - else: - shared.gradio['save_session'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('save_contents')).then( - lambda: 'logs/', None, gradio('save_root')).then( - lambda: f'session_{shared.get_mode()}_{utils.current_time()}.json', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['session_menu'].change( - load_session, gradio('session_menu', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False) - - shared.gradio['delete_session'].click( - lambda x: f'{x}.json', gradio('session_menu'), gradio('delete_filename')).then( - lambda: 'logs/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - - -def set_interface_arguments(interface_mode, extensions, bool_active): - modes = ["default", "notebook", "chat", "cai_chat"] - cmd_list = vars(shared.args) - bool_list = [k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes] - - shared.args.extensions = extensions - for k in modes[1:]: - setattr(shared.args, k, False) - if interface_mode != "default": - setattr(shared.args, interface_mode, True) - - for k in bool_list: - setattr(shared.args, k, False) - for k in bool_active: - setattr(shared.args, k, True) - - shared.need_restart = True - - def create_interface(): - # Defining some variables - gen_events = [] - default_preset = shared.settings['preset'] - default_text = load_prompt(shared.settings['prompt']) title = 'Text generation web UI' - # Authentication variables - auth = None - gradio_auth_creds = [] + # Password authentication + auth = [] if shared.args.gradio_auth: - gradio_auth_creds += [x.strip() for x in shared.args.gradio_auth.strip('"').replace('\n', '').split(',') if x.strip()] - if shared.args.gradio_auth_path is not None: + auth.extend(x.strip() for x in shared.args.gradio_auth.strip('"').replace('\n', '').split(',') if x.strip()) + if shared.args.gradio_auth_path: with open(shared.args.gradio_auth_path, 'r', encoding="utf8") as file: - for line in file.readlines(): - gradio_auth_creds += [x.strip() for x in line.split(',') if x.strip()] - if gradio_auth_creds: - auth = [tuple(cred.split(':')) for cred in gradio_auth_creds] + auth.extend(x.strip() for line in file for x in line.split(',') if x.strip()) + auth = [tuple(cred.split(':')) for cred in auth] - # Importing the extension files and executing their setup() functions + # Import the extensions and execute their setup() functions if 
shared.args.extensions is not None and len(shared.args.extensions) > 0: extensions_module.load_extensions() - # Forcing some events to be triggered on page load + # Force some events to be triggered on page load shared.persistent_interface_state.update({ 'loader': shared.args.loader or 'Transformers', + 'mode': shared.settings['mode'], + 'character_menu': shared.args.character or shared.settings['character'], + 'instruction_template': shared.settings['instruction_template'], + 'prompt_menu-default': shared.settings['prompt-default'], + 'prompt_menu-notebook': shared.settings['prompt-notebook'], }) - if shared.is_chat(): - shared.persistent_interface_state.update({ - 'mode': shared.settings['mode'], - 'character_menu': shared.args.character or shared.settings['character'], - 'instruction_template': shared.settings['instruction_template'] - }) - - if Path("cache/pfp_character.png").exists(): - Path("cache/pfp_character.png").unlink() + if Path("cache/pfp_character.png").exists(): + Path("cache/pfp_character.png").unlink() # css/js strings - css = ui.css if not shared.is_chat() else ui.css + ui.chat_css - js = ui.main_js if not shared.is_chat() else ui.main_js + ui.chat_js + css = ui.css + js = ui.js css += apply_extensions('css') js += apply_extensions('js') + # Interface state elements + shared.input_elements = ui.list_interface_input_elements() + with gr.Blocks(css=css, analytics_enabled=False, title=title, theme=ui.theme) as shared.gradio['interface']: + + # Interface state + shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) + + # Audio notification if Path("notification.mp3").exists(): shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="notification.mp3", elem_id="audio_notification", visible=False) - audio_notification_js = "document.querySelector('#audio_notification audio')?.play();" - else: - audio_notification_js = "" # Floating menus for saving/deleting files - create_file_saving_menus() + ui_file_saving.create_ui() - # Create chat mode interface - if shared.is_chat(): - shared.input_elements = ui.list_interface_input_elements() + # Temporary clipboard for saving files + shared.gradio['temporary_text'] = gr.Textbox(visible=False) - shared.gradio.update({ - 'interface_state': gr.State({k: None for k in shared.input_elements}), - 'Chat input': gr.State(), - 'dummy': gr.State(), - 'history': gr.State({'internal': [], 'visible': []}), - }) + # Text Generation tab + ui_chat.create_ui() + ui_default.create_ui() + ui_notebook.create_ui() - with gr.Tab('Text generation', elem_id='main'): - shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) - shared.gradio['textbox'] = gr.Textbox(label='Input') - with gr.Row(): - shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') - shared.gradio['Generate'] = gr.Button('Generate', elem_id='Generate', variant='primary') - shared.gradio['Continue'] = gr.Button('Continue') + ui_parameters.create_ui(shared.settings['preset']) # Parameters tab + ui_model_menu.create_ui() # Model tab + training.create_ui() # Training tab + ui_session.create_ui() # Session tab - with gr.Row(): - shared.gradio['Impersonate'] = gr.Button('Impersonate') - shared.gradio['Regenerate'] = gr.Button('Regenerate') - shared.gradio['Remove last'] = gr.Button('Remove last') + # Generation events + ui_chat.create_event_handlers() + ui_default.create_event_handlers() + ui_notebook.create_event_handlers() - with 
gr.Row(): - shared.gradio['Copy last reply'] = gr.Button('Copy last reply') - shared.gradio['Replace last reply'] = gr.Button('Replace last reply') - shared.gradio['Send dummy message'] = gr.Button('Send dummy message') - shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') - - with gr.Row(): - shared.gradio['Clear history'] = gr.Button('Clear history') - shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant='stop', visible=False) - shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) - - with gr.Row(): - shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with']) - - with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under "Chat settings" must match the current model.') - shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') - - with gr.Tab('Chat settings', elem_id='chat-settings'): - - with gr.Tab("Character"): - with gr.Row(): - with gr.Column(scale=8): - with gr.Row(): - shared.gradio['character_menu'] = gr.Dropdown(value='None', choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button') - shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button') - - shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') - shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') - shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=4, label='Context', elem_classes=['add_scrollbar']) - shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=4, label='Greeting', elem_classes=['add_scrollbar']) - - with gr.Column(scale=1): - shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') - shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None) - - with gr.Tab("Instruction template"): - with gr.Row(): - with gr.Row(): - shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Instruction template', value='None', info='Change this according to the model/LoRA that you are using. 
Used in instruct and chat-instruct modes.', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button') - shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button') - - shared.gradio['name1_instruct'] = gr.Textbox(value='', lines=2, label='User string') - shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') - shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') - shared.gradio['turn_template'] = gr.Textbox(value=shared.settings['turn_template'], lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') - with gr.Row(): - shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar']) - - with gr.Tab('Chat history'): - with gr.Row(): - with gr.Column(): - shared.gradio['download'] = gr.File(label="Download") - shared.gradio['download_button'] = gr.Button(value='Refresh') - - with gr.Column(): - shared.gradio['upload_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label="Upload") - - with gr.Tab('Upload character'): - with gr.Tab('YAML or JSON'): - with gr.Row(): - shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File') - shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)') - - shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False) - - with gr.Tab('TavernAI PNG'): - with gr.Row(): - with gr.Column(): - shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id="upload_img_tavern") - shared.gradio['tavern_json'] = gr.State() - with gr.Column(): - shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) - shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False) - - shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False) - - with gr.Tab("Parameters", elem_id="parameters"): - create_settings_menus(default_preset) - - # Create notebook mode interface - elif shared.args.notebook: - shared.input_elements = ui.list_interface_input_elements() - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) - shared.gradio['last_input'] = gr.State('') - with gr.Tab("Text generation", elem_id="main"): - with gr.Row(): - with gr.Column(scale=4): - with gr.Tab('Raw'): - shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox', 'add_scrollbar'], lines=27) - - with gr.Tab('Markdown'): - shared.gradio['markdown_render'] = gr.Button('Render') - shared.gradio['markdown'] = gr.Markdown() - - with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() - - with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes="small-button") - shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button") - shared.gradio['Undo'] = gr.Button('Undo', elem_classes="small-button") - shared.gradio['Regenerate'] = gr.Button('Regenerate', 
elem_classes="small-button") - - with gr.Column(scale=1): - gr.HTML('
    ') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - with gr.Row(): - shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, ['refresh-button', 'refresh-button-small']) - shared.gradio['save_prompt'] = gr.Button('💾', elem_classes=['refresh-button', 'refresh-button-small']) - shared.gradio['delete_prompt'] = gr.Button('🗑️', elem_classes=['refresh-button', 'refresh-button-small']) - - shared.gradio['count_tokens'] = gr.Button('Count tokens') - shared.gradio['status'] = gr.Markdown('') - - with gr.Tab("Parameters", elem_id="parameters"): - create_settings_menus(default_preset) - - # Create default mode interface - else: - shared.input_elements = ui.list_interface_input_elements() - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) - shared.gradio['last_input'] = gr.State('') - with gr.Tab("Text generation", elem_id="main"): - with gr.Row(): - with gr.Column(): - shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop'] = gr.Button('Stop') - shared.gradio['Continue'] = gr.Button('Continue') - shared.gradio['count_tokens'] = gr.Button('Count tokens') - - with gr.Row(): - shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, 'refresh-button') - shared.gradio['save_prompt'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_prompt'] = gr.Button('🗑️', elem_classes='refresh-button') - - shared.gradio['status'] = gr.Markdown('') - - with gr.Column(): - with gr.Tab('Raw'): - shared.gradio['output_textbox'] = gr.Textbox(lines=27, label='Output', elem_classes=['textbox_default_output', 'add_scrollbar']) - - with gr.Tab('Markdown'): - shared.gradio['markdown_render'] = gr.Button('Render') - shared.gradio['markdown'] = gr.Markdown() - - with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() - - with gr.Tab("Parameters", elem_id="parameters"): - create_settings_menus(default_preset) - - # Model tab - with gr.Tab("Model", elem_id="model-tab"): - create_model_menus() - - # Training tab - with gr.Tab("Training", elem_id="training-tab"): - training.create_train_interface() - - # Session tab - with gr.Tab("Session", elem_id="session-tab"): - modes = ["default", "notebook", "chat"] - current_mode = "default" - for mode in modes[1:]: - if getattr(shared.args, mode): - current_mode = mode - break - - cmd_list = vars(shared.args) - bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes + ui.list_model_elements()]) - bool_active = [k for k in bool_list if vars(shared.args)[k]] - - with gr.Row(): - - with gr.Column(): - with gr.Row(): - 
shared.gradio['interface_modes_menu'] = gr.Dropdown(choices=modes, value=current_mode, label="Mode", elem_classes='slim-dropdown') - shared.gradio['reset_interface'] = gr.Button("Apply and restart", elem_classes="small-button", variant="primary") - shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡', elem_classes="small-button") - - with gr.Row(): - with gr.Column(): - shared.gradio['extensions_menu'] = gr.CheckboxGroup(choices=utils.get_available_extensions(), value=shared.args.extensions, label="Available extensions", info='Note that some of these extensions may require manually installing Python requirements through the command: pip install -r extensions/extension_name/requirements.txt', elem_classes='checkboxgroup-table') - - with gr.Column(): - shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=bool_list, value=bool_active, label="Boolean command-line flags", elem_classes='checkboxgroup-table') - - with gr.Column(): - if not shared.args.multi_user: - with gr.Row(): - shared.gradio['session_menu'] = gr.Dropdown(choices=utils.get_available_sessions(), value='None', label='Session', elem_classes='slim-dropdown', info='When saving a session, make sure to keep the initial part of the filename (session_chat, session_notebook, or session_default), otherwise it will not appear on this list afterwards.') - ui.create_refresh_button(shared.gradio['session_menu'], lambda: None, lambda: {'choices': utils.get_available_sessions()}, ['refresh-button']) - shared.gradio['save_session'] = gr.Button('💾', elem_classes=['refresh-button']) - shared.gradio['delete_session'] = gr.Button('🗑️', elem_classes=['refresh-button']) - - extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') - extension_status = gr.Markdown() - - extension_name.submit( - clone_or_pull_repository, extension_name, extension_status, show_progress=False).then( - lambda: gr.update(choices=utils.get_available_extensions(), value=shared.args.extensions), None, gradio('extensions_menu')) - - # Reset interface event - shared.gradio['reset_interface'].click( - set_interface_arguments, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then( - lambda: None, None, None, _js='() => {document.body.innerHTML=\'
Reloading...
    \'; setTimeout(function(){location.reload()},2500); return []}') - - shared.gradio['toggle_dark_mode'].click(lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}') - - # chat mode event handlers - if shared.is_chat(): - shared.input_params = gradio('Chat input', 'start_with', 'interface_state') - clear_arr = gradio('Clear history-confirm', 'Clear history', 'Clear history-cancel') - shared.reload_inputs = gradio('history', 'name1', 'name2', 'mode', 'chat_style') - - gen_events.append(shared.gradio['Generate'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['textbox'].submit( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['Regenerate'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_reply_wrapper, regenerate=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['Continue'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_reply_wrapper, _continue=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['Impersonate'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( - chat.impersonate_wrapper, shared.input_params, gradio('textbox'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - shared.gradio['Replace last reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.replace_last_reply, gradio('textbox', 
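Note: the removed chat handlers in this hunk all follow the same Gradio chaining idiom: gather the interface state, run the action, redraw the HTML, then persist the history, expressed as `.click(...).then(...).then(...)`. A minimal, self-contained sketch of that pattern, using toy components and functions rather than the web UI's own:

```python
# Minimal, self-contained sketch of the .click(...).then(...) chaining pattern
# (Gradio 3.x). The components and functions here are toy examples, not the
# web UI's own handlers.
import gradio as gr


def generate(text):
    # Stand-in for the real generation step.
    return f'Echo: {text}'


def persist(text):
    # Stand-in for the "save history" step that runs after generation.
    print('saved:', text)


with gr.Blocks() as demo:
    textbox = gr.Textbox(label='Input')
    output = gr.Textbox(label='Output')
    button = gr.Button('Generate')

    # The second step only runs once the first has finished.
    button.click(generate, textbox, output).then(
        persist, output, None)

demo.launch()
```

Each `.then()` step starts only after the previous one completes, which is what lets the handlers in this file save the chat history only after generation has finished.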
'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Send dummy message'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.send_dummy_message, gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Send dummy reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.send_dummy_reply, gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Clear history'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, clear_arr) - shared.gradio['Clear history-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr) - shared.gradio['Clear history-confirm'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr).then( - chat.clear_chat_log, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Remove last'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.remove_last_message, gradio('history'), gradio('textbox', 'history'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['character_menu'].change( - partial(chat.load_character, instruct=False), gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context', 'dummy')).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.load_persistent_history, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - shared.gradio['Stop'].click( - stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - shared.gradio['mode'].change( - lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - shared.gradio['chat_style'].change(chat.redraw_html, shared.reload_inputs, gradio('display')) - shared.gradio['instruction_template'].change( - partial(chat.load_character, instruct=True), gradio('instruction_template', 'name1_instruct', 'name2_instruct'), gradio('name1_instruct', 'name2_instruct', 'dummy', 'dummy', 'context_instruct', 'turn_template')) - - 
shared.gradio['upload_chat_history'].upload( - chat.load_history, gradio('upload_chat_history', 'history'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) - - # Save/delete a character - shared.gradio['save_character'].click( - lambda x: x, gradio('name2'), gradio('save_character_filename')).then( - lambda: gr.update(visible=True), None, gradio('character_saver')) - - shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter')) - - shared.gradio['save_template'].click( - lambda: 'My Template.yaml', None, gradio('save_filename')).then( - lambda: 'characters/instruction-following/', None, gradio('save_root')).then( - chat.generate_instruction_template_yaml, gradio('name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template'), gradio('save_contents')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_template'].click( - lambda x: f'{x}.yaml', gradio('instruction_template'), gradio('delete_filename')).then( - lambda: 'characters/instruction-following/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - - shared.gradio['download_button'].click(chat.save_history_at_user_request, gradio('history', 'character_menu', 'mode'), gradio('download')) - shared.gradio['Submit character'].click(chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')) - shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) - shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) - - shared.gradio['Submit tavern character'].click(chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')) - shared.gradio['upload_img_tavern'].upload(chat.check_tavern_character, gradio('upload_img_tavern'), gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) - shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) - shared.gradio['your_picture'].change( - chat.upload_your_profile_picture, gradio('your_picture'), None).then( - partial(chat.redraw_html, reset_cache=True), shared.reload_inputs, gradio('display')) - - # notebook/default modes event handlers - else: - shared.input_params = gradio('textbox', 'interface_state') - if shared.args.notebook: - output_params = gradio('textbox', 'html') - else: - output_params = gradio('output_textbox', 'html') - - gen_events.append(shared.gradio['Generate'].click( - lambda x: x, gradio('textbox'), gradio('last_input')).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) - - gen_events.append(shared.gradio['textbox'].submit( - lambda x: x, gradio('textbox'), 
gradio('last_input')).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) - - if shared.args.notebook: - shared.gradio['Undo'].click(lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False) - shared.gradio['markdown_render'].click(lambda x: x, gradio('textbox'), gradio('markdown'), queue=False) - gen_events.append(shared.gradio['Regenerate'].click( - lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) - else: - shared.gradio['markdown_render'].click(lambda x: x, gradio('output_textbox'), gradio('markdown'), queue=False) - gen_events.append(shared.gradio['Continue'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, [shared.gradio['output_textbox']] + shared.input_params[1:], output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[1]; element.scrollTop = element.scrollHeight}") - ) - - shared.gradio['Stop'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) - shared.gradio['prompt_menu'].change(load_prompt, gradio('prompt_menu'), gradio('textbox'), show_progress=False) - shared.gradio['save_prompt'].click( - lambda x: x, gradio('textbox'), gradio('save_contents')).then( - lambda: 'prompts/', None, gradio('save_root')).then( - lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_prompt'].click( - lambda: 'prompts/', None, gradio('delete_root')).then( - lambda x: x + '.txt', gradio('prompt_menu'), gradio('delete_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - - shared.gradio['count_tokens'].click(count_tokens, gradio('textbox'), gradio('status'), show_progress=False) - - create_file_saving_event_handlers() + # Other events + ui_file_saving.create_event_handlers() + ui_parameters.create_event_handlers() + ui_model_menu.create_event_handlers() + # Interface launch events if shared.settings['dark_theme']: shared.gradio['interface'].load(lambda: None, None, None, _js="() => document.getElementsByTagName('body')[0].classList.add('dark')") shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}") + shared.gradio['interface'].load(None, gradio('show_controls'), None, _js=f'(x) => 
{{{ui.show_controls_js}; toggle_controls(x)}}') shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False) - if shared.is_chat(): - shared.gradio['interface'].load(chat.redraw_html, shared.reload_inputs, gradio('display')) + shared.gradio['interface'].load(chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) - # Extensions tabs - extensions_module.create_extensions_tabs() - - # Extensions block - extensions_module.create_extensions_block() + extensions_module.create_extensions_tabs() # Extensions tabs + extensions_module.create_extensions_block() # Extensions block # Launch the interface - shared.gradio['interface'].queue() + shared.gradio['interface'].queue(concurrency_count=64) with OpenMonkeyPatch(): - if shared.args.listen: - shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_name=shared.args.listen_host or '0.0.0.0', server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth) - else: - shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth) + shared.gradio['interface'].launch( + prevent_thread_lock=True, + share=shared.args.share, + server_name=None if not shared.args.listen else (shared.args.listen_host or '0.0.0.0'), + server_port=shared.args.listen_port, + inbrowser=shared.args.auto_launch, + auth=auth or None, + ssl_verify=False if (shared.args.ssl_keyfile or shared.args.ssl_certfile) else True, + ssl_keyfile=shared.args.ssl_keyfile, + ssl_certfile=shared.args.ssl_certfile + ) if __name__ == "__main__": - # Loading custom settings + + # Load custom settings settings_file = None if shared.args.settings is not None and Path(shared.args.settings).exists(): settings_file = Path(shared.args.settings) @@ -1103,10 +166,9 @@ if __name__ == "__main__": logger.info(f"Loading settings from {settings_file}...") file_contents = open(settings_file, 'r', encoding='utf-8').read() new_settings = json.loads(file_contents) if settings_file.suffix == "json" else yaml.safe_load(file_contents) - for item in new_settings: - shared.settings[item] = new_settings[item] + shared.settings.update(new_settings) - # Set default model settings based on settings file + # Fallback settings for models shared.model_config['.*'] = { 'wbits': 'None', 'model_type': 'None', @@ -1118,22 +180,17 @@ if __name__ == "__main__": 'truncation_length': shared.settings['truncation_length'], 'n_gqa': 0, 'rms_norm_eps': 0, + 'rope_freq_base': 0, } shared.model_config.move_to_end('.*', last=False) # Move to the beginning - # Default extensions + # Activate the extensions listed on settings.yaml extensions_module.available_extensions = utils.get_available_extensions() - if shared.is_chat(): - for extension in shared.settings['chat_default_extensions']: - shared.args.extensions = shared.args.extensions or [] - if extension not in shared.args.extensions: - shared.args.extensions.append(extension) - else: - for extension in shared.settings['default_extensions']: - shared.args.extensions = shared.args.extensions or [] - if extension not in shared.args.extensions: - shared.args.extensions.append(extension) + for extension in shared.settings['default_extensions']: + shared.args.extensions = shared.args.extensions or [] + if extension not in shared.args.extensions: + shared.args.extensions.append(extension) available_models = 
utils.get_available_models() diff --git a/settings-template.yaml b/settings-template.yaml index 3d6585d3..b2526df1 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -1,36 +1,35 @@ dark_theme: true -autoload_model: false +show_controls: true +start_with: '' +mode: chat +chat_style: TheEncrypted777 +character: None +prompt-default: QA +prompt-notebook: QA +preset: simple-1 max_new_tokens: 200 max_new_tokens_min: 1 max_new_tokens_max: 4096 seed: -1 -character: None +negative_prompt: '' +truncation_length: 2048 +truncation_length_min: 0 +truncation_length_max: 16384 +custom_stopping_strings: '' +auto_max_new_tokens: false +ban_eos_token: false +add_bos_token: true +skip_special_tokens: true +stream: true name1: You name2: Assistant context: This is a conversation with your Assistant. It is a computer program designed to help you with various tasks such as answering questions, providing recommendations, and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information. greeting: '' -turn_template: '' -custom_stopping_strings: '' -stop_at_newline: false -add_bos_token: true -ban_eos_token: false -skip_special_tokens: true -truncation_length: 2048 -truncation_length_min: 0 -truncation_length_max: 16384 -mode: chat -start_with: '' -chat_style: TheEncrypted777 instruction_template: None chat-instruct_command: |- Continue the chat dialogue below. Write a single reply for the character "<|character|>". <|prompt|> -chat_generation_attempts: 1 -chat_generation_attempts_min: 1 -chat_generation_attempts_max: 10 -default_extensions: [] -chat_default_extensions: +autoload_model: false +default_extensions: - gallery -preset: simple-1 -prompt: QA
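Note: with the flattened `settings-template.yaml` above, user overrides are merged over the built-in defaults with a single `dict.update()`, and the settings file may be JSON or YAML. A minimal sketch of that loading behaviour (the defaults and file name below are illustrative):

```python
# Sketch of the settings-loading behaviour introduced above: read a JSON or
# YAML settings file and merge it over the built-in defaults with dict.update.
# The defaults and file name here are illustrative.
import json
from pathlib import Path

import yaml

settings = {
    'dark_theme': True,
    'preset': 'simple-1',
    'max_new_tokens': 200,
    'truncation_length': 2048,
}

settings_file = Path('settings.yaml')
if settings_file.exists():
    contents = settings_file.read_text(encoding='utf-8')
    new_settings = json.loads(contents) if settings_file.suffix == '.json' else yaml.safe_load(contents)
    settings.update(new_settings or {})

print(settings['preset'])
```

Unknown keys simply override or extend the defaults, which is why the per-key copy loop in the old code could be dropped.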