From 345b6dee8c65b0979812a9051864f9ae0e87d25c Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 19:59:57 +0300 Subject: [PATCH 01/45] refactor quant models loader and add support of OPT --- .../{quantized_LLaMA.py => quant_loader.py} | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) rename modules/{quantized_LLaMA.py => quant_loader.py} (61%) diff --git a/modules/quantized_LLaMA.py b/modules/quant_loader.py similarity index 61% rename from modules/quantized_LLaMA.py rename to modules/quant_loader.py index e9352f90..8bf505a6 100644 --- a/modules/quantized_LLaMA.py +++ b/modules/quant_loader.py @@ -7,28 +7,20 @@ import torch import modules.shared as shared sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) -from llama import load_quant # 4-bit LLaMA -def load_quantized_LLaMA(model_name): - if shared.args.load_in_4bit: - bits = 4 +def load_quant(model_name, model_type): + if model_type == 'llama': + from llama import load_quant + elif model_type == 'opt': + from opt import load_quant else: - bits = shared.args.gptq_bits + print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported") + exit() path_to_model = Path(f'models/{model_name}') - pt_model = '' - if path_to_model.name.lower().startswith('llama-7b'): - pt_model = f'llama-7b-{bits}bit.pt' - elif path_to_model.name.lower().startswith('llama-13b'): - pt_model = f'llama-13b-{bits}bit.pt' - elif path_to_model.name.lower().startswith('llama-30b'): - pt_model = f'llama-30b-{bits}bit.pt' - elif path_to_model.name.lower().startswith('llama-65b'): - pt_model = f'llama-65b-{bits}bit.pt' - else: - pt_model = f'{model_name}-{bits}bit.pt' + pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt' # Try to find the .pt both in models/ and in the subfolder pt_path = None @@ -40,7 +32,7 @@ def load_quantized_LLaMA(model_name): print(f"Could not find {pt_model}, exiting...") exit() - model = load_quant(path_to_model, str(pt_path), bits) + model = load_quant(path_to_model, str(pt_path), shared.args.gptq_bits) # Multiple GPUs or GPU+CPU if shared.args.gpu_memory: From edbc61139ff5a0ccb2c41a3d8446b231fd31ac5e Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 20:00:38 +0300 Subject: [PATCH 02/45] use new quant loader --- modules/models.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/models.py b/modules/models.py index 7d094ed5..31696795 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,6 +1,5 @@ import json import os -import sys import time import zipfile from pathlib import Path @@ -35,6 +34,7 @@ if shared.args.deepspeed: ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration + def load_model(model_name): print(f"Loading {model_name}...") t0 = time.time() @@ -42,7 +42,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.gptq_bits > 0, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, 
shared.args.flexgen, shared.is_RWKV]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -87,11 +87,11 @@ def load_model(model_name): return model, tokenizer - # 4-bit LLaMA - elif shared.args.gptq_bits > 0 or shared.args.load_in_4bit: - from modules.quantized_LLaMA import load_quantized_LLaMA + # Quantized model + elif shared.args.gptq_bits > 0: + from modules.quant_loader import load_quant - model = load_quantized_LLaMA(model_name) + model = load_quant(model_name, shared.args.gptq_model_type) # Custom else: From 1b99ed61bc834a76b1d436fe6e7ad411a46c8385 Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 20:01:34 +0300 Subject: [PATCH 03/45] add argument --gptq-model-type and remove duplicate arguments --- modules/shared.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 5f6c01f3..c74117ab 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -68,8 +68,8 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.') parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') -parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.') -parser.add_argument('--gptq-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.') +parser.add_argument('--gptq-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.') +parser.add_argument('--gptq-model-type', type=str, default='llama', help='Model type of pre-quantized model. Currently only LLaMa and OPT are supported.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. 
Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') From 3c9afd5ca32d082eb29cab2eb708a0247244195e Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 20:14:40 +0300 Subject: [PATCH 04/45] rename method --- modules/models.py | 4 ++-- modules/quant_loader.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/models.py b/modules/models.py index 31696795..e4dce127 100644 --- a/modules/models.py +++ b/modules/models.py @@ -89,9 +89,9 @@ def load_model(model_name): # Quantized model elif shared.args.gptq_bits > 0: - from modules.quant_loader import load_quant + from modules.quant_loader import load_quantized - model = load_quant(model_name, shared.args.gptq_model_type) + model = load_quantized(model_name, shared.args.gptq_model_type) # Custom else: diff --git a/modules/quant_loader.py b/modules/quant_loader.py index 8bf505a6..a2b484b0 100644 --- a/modules/quant_loader.py +++ b/modules/quant_loader.py @@ -10,7 +10,7 @@ sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) # 4-bit LLaMA -def load_quant(model_name, model_type): +def load_quantized(model_name, model_type): if model_type == 'llama': from llama import load_quant elif model_type == 'opt': From b746250b2f3dd543f354c0b8b7db26e607be489a Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 20:18:56 +0300 Subject: [PATCH 05/45] Update README --- README.md | 66 +++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 89b567f2..16e07592 100644 --- a/README.md +++ b/README.md @@ -132,39 +132,39 @@ Then browse to Optionally, you can use the following command-line flags: -| Flag | Description | -|-------------|-------------| -| `-h`, `--help` | show this help message and exit | -| `--model MODEL` | Name of the model to load by default. | -| `--notebook` | Launch the web UI in notebook mode, where the output is written to the same text box as the input. | -| `--chat` | Launch the web UI in chat mode.| -| `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. | -| `--cpu` | Use the CPU to generate text.| -| `--load-in-8bit` | Load the model with 8-bit precision.| -| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.| -| `--gptq-bits GPTQ_BITS` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA. | -| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | -| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| -| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | -| `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. | -| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maxmimum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. 
| -| `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.| -| `--flexgen` | Enable the use of FlexGen offloading. | -| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). | -| `--compress-weight` | FlexGen: Whether to compress weight (default: False).| -| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). | -| `--deepspeed` | Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. | -| `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. | -| `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. | -| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | -| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | -| `--no-stream` | Don't stream the text output in real time. This improves the text generation performance.| -| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example. If you create a file called `settings.json`, this file will be loaded by default without the need to use the `--settings` flag.| -| `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. | -| `--listen` | Make the web UI reachable from your local network.| -| `--listen-port LISTEN_PORT` | The listening port that the server will use. | -| `--share` | Create a public URL. This is useful for running the web UI on Google Colab or similar. | -| `--verbose` | Print the prompts to the terminal. | +| Flag | Description | +|--------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `-h`, `--help` | show this help message and exit | +| `--model MODEL` | Name of the model to load by default. | +| `--notebook` | Launch the web UI in notebook mode, where the output is written to the same text box as the input. | +| `--chat` | Launch the web UI in chat mode. | +| `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. | +| `--cpu` | Use the CPU to generate text. | +| `--load-in-8bit` | Load the model with 8-bit precision. | +| `--gptq-bits GPTQ_BITS` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. | +| `--gptq-model-type MODEL_TYPE` | Model type of pre-quantized model. Currently only LLaMa and OPT are supported. | +| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | +| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU. | +| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. 
| +| `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. | +| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maxmimum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. | +| `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99. | +| `--flexgen` | Enable the use of FlexGen offloading. | +| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). | +| `--compress-weight` | FlexGen: Whether to compress weight (default: False). | +| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). | +| `--deepspeed` | Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. | +| `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. | +| `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. | +| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | +| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | +| `--no-stream` | Don't stream the text output in real time. This improves the text generation performance. | +| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example. If you create a file called `settings.json`, this file will be loaded by default without the need to use the `--settings` flag. | +| `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. | +| `--listen` | Make the web UI reachable from your local network. | +| `--listen-port LISTEN_PORT` | The listening port that the server will use. | +| `--share` | Create a public URL. This is useful for running the web UI on Google Colab or similar. | +| `--verbose` | Print the prompts to the terminal. | Out of memory errors? [Check this guide](https://github.com/oobabooga/text-generation-webui/wiki/Low-VRAM-guide). 
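For reference, the two GPTQ flags documented in the table above are meant to be used together; the model folder names below are only illustrative and must correspond to a directory under models/ that contains a matching .pt file (for --gptq-bits 4, e.g. llama-13b-4bit.pt):

    # 4-bit pre-quantized LLaMA; the loader looks for llama-13b-4bit.pt in models/ or in the model folder
    python server.py --model llama-13b --gptq-bits 4 --gptq-model-type llama

    # 4-bit pre-quantized OPT; the loader looks for opt-13b-4bit.pt
    python server.py --model opt-13b --gptq-bits 4 --gptq-model-type opt
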
From e1c952c41ce7ce93dde13dc77860b55d80d59be4 Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 20:22:38 +0300 Subject: [PATCH 06/45] make argument non case-sensitive --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index e4dce127..9bcaca9e 100644 --- a/modules/models.py +++ b/modules/models.py @@ -91,7 +91,7 @@ def load_model(model_name): elif shared.args.gptq_bits > 0: from modules.quant_loader import load_quantized - model = load_quantized(model_name, shared.args.gptq_model_type) + model = load_quantized(model_name, shared.args.gptq_model_type.lower()) # Custom else: From b6c5c57f2ec5e3aa13f51363091db4dcbbc685ef Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 22:11:08 +0300 Subject: [PATCH 07/45] remove default value from argument --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index c74117ab..aeb82806 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -69,7 +69,7 @@ parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI i parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') parser.add_argument('--gptq-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.') -parser.add_argument('--gptq-model-type', type=str, default='llama', help='Model type of pre-quantized model. Currently only LLaMa and OPT are supported.') +parser.add_argument('--gptq-model-type', type=str, help='Model type of pre-quantized model. Currently only LLaMa and OPT are supported.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') From a6a6522b6a910c0bf6faa76191f89543db43eadf Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 22:11:32 +0300 Subject: [PATCH 08/45] determine model type from model name --- modules/quant_loader.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/modules/quant_loader.py b/modules/quant_loader.py index a2b484b0..7a5f8461 100644 --- a/modules/quant_loader.py +++ b/modules/quant_loader.py @@ -9,8 +9,17 @@ import modules.shared as shared sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) -# 4-bit LLaMA -def load_quantized(model_name, model_type): +def load_quantized(model_name): + if not shared.args.gptq_model_type: + # Try to determine model type from model name + model_type = model_name.split('-')[0].lower() + if model_type not in ('llama', 'opt'): + print("Can't determine model type from model name. 
Please specify it manually using --gptq-model-type " + "argument") + exit() + else: + model_type = shared.args.gptq_model_type.lower() + if model_type == 'llama': from llama import load_quant elif model_type == 'opt': @@ -20,7 +29,16 @@ def load_quantized(model_name, model_type): exit() path_to_model = Path(f'models/{model_name}') - pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt' + if path_to_model.name.lower().startswith('llama-7b'): + pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt' + elif path_to_model.name.lower().startswith('llama-13b'): + pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt' + elif path_to_model.name.lower().startswith('llama-30b'): + pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt' + elif path_to_model.name.lower().startswith('llama-65b'): + pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt' + else: + pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt' # Try to find the .pt both in models/ and in the subfolder pt_path = None From 8778b756e69144ff91180076068eeb9bcd915a60 Mon Sep 17 00:00:00 2001 From: Ayanami Rei Date: Mon, 13 Mar 2023 22:11:40 +0300 Subject: [PATCH 09/45] use updated load_quantized --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 9bcaca9e..46cd77ff 100644 --- a/modules/models.py +++ b/modules/models.py @@ -91,7 +91,7 @@ def load_model(model_name): elif shared.args.gptq_bits > 0: from modules.quant_loader import load_quantized - model = load_quantized(model_name, shared.args.gptq_model_type.lower()) + model = load_quantized(model_name) # Custom else: From 518e5c4244b1d373d616ab32215b2f1c195deae8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 16:45:08 -0300 Subject: [PATCH 10/45] Some minor fixes to the GPTQ loader --- modules/quant_loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/quant_loader.py b/modules/quant_loader.py index 7a5f8461..c2723490 100644 --- a/modules/quant_loader.py +++ b/modules/quant_loader.py @@ -7,6 +7,8 @@ import torch import modules.shared as shared sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) +import llama +import opt def load_quantized(model_name): @@ -21,9 +23,9 @@ def load_quantized(model_name): model_type = shared.args.gptq_model_type.lower() if model_type == 'llama': - from llama import load_quant + load_quant = llama.load_quant elif model_type == 'opt': - from opt import load_quant + load_quant = opt.load_quant else: print("Unknown pre-quantized model type specified. 
Only 'llama' and 'opt' are supported") exit() @@ -50,7 +52,7 @@ def load_quantized(model_name): print(f"Could not find {pt_model}, exiting...") exit() - model = load_quant(path_to_model, str(pt_path), shared.args.gptq_bits) + model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits) # Multiple GPUs or GPU+CPU if shared.args.gpu_memory: From 0a755847069fa1494c37c7bb27dcdc9115b0372f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:07:08 -0300 Subject: [PATCH 11/45] Create issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 38 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..dd84ea78 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,38 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..bbcbbe7d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. 
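Returning to the loader change in PATCH 08 above: when --gptq-model-type is omitted, the type is guessed from the text before the first hyphen of the model folder name, and an explicit flag always takes precedence. A standalone sketch of that rule (the function name is illustrative, not part of the codebase):

    # Guess the GPTQ model type ('llama' or 'opt') from a folder name such as
    # "llama-13b" or "opt-6.7b"; anything else requires --gptq-model-type.
    def guess_gptq_model_type(model_name: str) -> str:
        model_type = model_name.split('-')[0].lower()
        if model_type not in ('llama', 'opt'):
            raise ValueError(f"Cannot infer model type from '{model_name}'; "
                             "pass --gptq-model-type explicitly")
        return model_type
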
From 69d4b818b7d8cdee9357ea834f96e324206eb03c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:09:37 -0300 Subject: [PATCH 12/45] Create bug_report_template.yml --- .../ISSUE_TEMPLATE/bug_report_template.yml | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report_template.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yml b/.github/ISSUE_TEMPLATE/bug_report_template.yml new file mode 100644 index 00000000..04924a40 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yml @@ -0,0 +1,65 @@ +name: "\U0001F41E Bug report" +description: Report a bug on Gradio +labels: [ "bug" ] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + - type: textarea + id: bug-description + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. If you intend to submit a PR for this issue, tell us in the description. Thanks! + placeholder: Bug description + validations: + required: true + - type: checkboxes + attributes: + label: Is there an existing issue for this? + description: Please search to see if an issue already exists for the issue you encountered. + options: + - label: I have searched the existing issues + required: true + - type: textarea + id: reproduction + attributes: + label: Reproduction + description: Please provide a link to a repo or REPL that can reproduce the problem you ran into. Or provide the Python code below that can be run to reproduce the issue. + placeholder: Reproduction + validations: + required: true + - type: textarea + id: screenshot + attributes: + label: Screenshot + description: "If possible, please include screenshot(s) so that we can understand what the issue is." + - type: textarea + id: logs + attributes: + label: Logs + description: "Please include the full stacktrace of the errors you get from Python or Javascript. If you are running in a colab notebooks, you can get the logs with by setting `debug=True`, i.e: `gradio.Interface.launch(debug=True)`" + render: shell + validations: + required: true + - type: textarea + id: system-info + attributes: + label: System Info + description: Please share your system info with us, you can get the Gradio version with `gradio.__version__` + render: shell + placeholder: Gradio version, Operating System, Browser + validations: + required: true + - type: dropdown + id: severity + attributes: + label: Severity + description: Select the severity of this issue + options: + - annoying + - serious, but I can work around it + - blocking upgrade to latest gradio version + - blocking all usage of gradio + validations: + required: true From 7dbc071e5acf7032ebfee663180d89df7c9eedc5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:09:58 -0300 Subject: [PATCH 13/45] Delete bug_report.md --- .github/ISSUE_TEMPLATE/bug_report.md | 38 ---------------------------- 1 file changed, 38 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index dd84ea78..00000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -name: Bug report -about: Create a report to help us improve -title: '' -labels: '' -assignees: '' - ---- - -**Describe the bug** -A clear and concise description of what the bug is. 
- -**To Reproduce** -Steps to reproduce the behavior: -1. Go to '...' -2. Click on '....' -3. Scroll down to '....' -4. See error - -**Expected behavior** -A clear and concise description of what you expected to happen. - -**Screenshots** -If applicable, add screenshots to help explain your problem. - -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - -**Additional context** -Add any other context about the problem here. From aee3b53fb37f7f2fb1e751e8ab2b61e85dfcc1b2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:14:31 -0300 Subject: [PATCH 14/45] Update bug_report_template.yml --- .../ISSUE_TEMPLATE/bug_report_template.yml | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yml b/.github/ISSUE_TEMPLATE/bug_report_template.yml index 04924a40..49fbe984 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_template.yml +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yml @@ -1,5 +1,5 @@ -name: "\U0001F41E Bug report" -description: Report a bug on Gradio +name: "Bug report" +description: Report a bug labels: [ "bug" ] body: - type: markdown @@ -10,7 +10,7 @@ body: id: bug-description attributes: label: Describe the bug - description: A clear and concise description of what the bug is. If you intend to submit a PR for this issue, tell us in the description. Thanks! + description: A clear and concise description of what the bug is. placeholder: Bug description validations: required: true @@ -25,7 +25,7 @@ body: id: reproduction attributes: label: Reproduction - description: Please provide a link to a repo or REPL that can reproduce the problem you ran into. Or provide the Python code below that can be run to reproduce the issue. + description: Please provide the steps necessary to reproduce your issue, including the command-line flags that were used (like `--chat`, `--gpu-memory`, etc). placeholder: Reproduction validations: required: true @@ -38,7 +38,7 @@ body: id: logs attributes: label: Logs - description: "Please include the full stacktrace of the errors you get from Python or Javascript. If you are running in a colab notebooks, you can get the logs with by setting `debug=True`, i.e: `gradio.Interface.launch(debug=True)`" + description: "Please include the full stacktrace of the errors you get in the command-line (if any)." render: shell validations: required: true @@ -46,20 +46,8 @@ body: id: system-info attributes: label: System Info - description: Please share your system info with us, you can get the Gradio version with `gradio.__version__` + description: Please share your system info with us: operating system and GPU brand/model. 
render: shell - placeholder: Gradio version, Operating System, Browser - validations: - required: true - - type: dropdown - id: severity - attributes: - label: Severity - description: Select the severity of this issue - options: - - annoying - - serious, but I can work around it - - blocking upgrade to latest gradio version - - blocking all usage of gradio + placeholder: validations: required: true From ed30bd3216d41b6a4f6694e4f440fbceaf46dff4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:14:54 -0300 Subject: [PATCH 15/45] Update bug_report_template.yml --- .github/ISSUE_TEMPLATE/bug_report_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yml b/.github/ISSUE_TEMPLATE/bug_report_template.yml index 49fbe984..19275180 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_template.yml +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yml @@ -46,7 +46,7 @@ body: id: system-info attributes: label: System Info - description: Please share your system info with us: operating system and GPU brand/model. + description: "Please share your system info with us: operating system and GPU brand/model." render: shell placeholder: validations: From bcfb7d752ae661da80aab0e1ed986211369b2ba1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:16:18 -0300 Subject: [PATCH 16/45] Update bug_report_template.yml --- .github/ISSUE_TEMPLATE/bug_report_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yml b/.github/ISSUE_TEMPLATE/bug_report_template.yml index 19275180..7af7aa0f 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_template.yml +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yml @@ -25,7 +25,7 @@ body: id: reproduction attributes: label: Reproduction - description: Please provide the steps necessary to reproduce your issue, including the command-line flags that were used (like `--chat`, `--gpu-memory`, etc). + description: Please provide the steps necessary to reproduce your issue, including the command-line flags that were used (`--chat`, `--gpu-memory`, etc). placeholder: Reproduction validations: required: true From 68464279e036beb7661ebc7ae7566e6f3de4e8de Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:19:07 -0300 Subject: [PATCH 17/45] Update feature_request.md --- .github/ISSUE_TEMPLATE/feature_request.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index bbcbbe7d..66149832 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,20 +1,16 @@ --- name: Feature request -about: Suggest an idea for this project +about: Suggest an improvement or new feature for the web UI title: '' labels: '' assignees: '' --- +- [ ] I have searched to see if a similar issue already exists. -**Is your feature request related to a problem? Please describe.** + +**Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] -**Describe the solution you'd like** +**Describe the feature you'd like** A clear and concise description of what you want to happen. 
- -**Describe alternatives you've considered** -A clear and concise description of any alternative solutions or features you've considered. - -**Additional context** -Add any other context or screenshots about the feature request here. From c6ecb354e5921d4c6aa2742e1476a3aa284a6d2d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:26:28 -0300 Subject: [PATCH 18/45] Update feature_request.md --- .github/ISSUE_TEMPLATE/feature_request.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 66149832..83bd0d87 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -6,11 +6,12 @@ labels: '' assignees: '' --- -- [ ] I have searched to see if a similar issue already exists. +**Description** -**Is your feature request related to a problem? Please describe.** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] +A clear and concise description of what you want to be implemented. + +**Additional Context** + +Any additional information or context that might be helpful, including relevant external URLs and screenshots if any. -**Describe the feature you'd like** -A clear and concise description of what you want to happen. From d6763a6560e0c1cfc487fed70e296daf921465fc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:27:24 -0300 Subject: [PATCH 19/45] Update feature_request.md --- .github/ISSUE_TEMPLATE/feature_request.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 83bd0d87..2bdf1b9a 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -13,5 +13,4 @@ A clear and concise description of what you want to be implemented. **Additional Context** -Any additional information or context that might be helpful, including relevant external URLs and screenshots if any. - +If applicable, please provide any extra information, external links, or screenshots that could be useful. From 511b1368b8f25619eb55591367d513b56cb1ba8b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:29:38 -0300 Subject: [PATCH 20/45] Update bug_report_template.yml --- .github/ISSUE_TEMPLATE/bug_report_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yml b/.github/ISSUE_TEMPLATE/bug_report_template.yml index 7af7aa0f..d55ce9f9 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_template.yml +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yml @@ -46,7 +46,7 @@ body: id: system-info attributes: label: System Info - description: "Please share your system info with us: operating system and GPU brand/model." + description: "Please share your system info with us: operating system, GPU brand, and GPU model. If you are using a Google Colab notebook, mention that instead." 
render: shell placeholder: validations: From 47c941c5fd0ba146a7d07be998811e135cf4e2ba Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:37:35 -0300 Subject: [PATCH 21/45] Create stale.yml --- .github/workflow/stale.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflow/stale.yml diff --git a/.github/workflow/stale.yml b/.github/workflow/stale.yml new file mode 100644 index 00000000..edfafd24 --- /dev/null +++ b/.github/workflow/stale.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "38 21 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: -1 + days-before-issue-close: 30 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 30 days. If you think that it is still relevant, feel free to reopen it (if you are the author) or comment down below." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} From 7c17613addc45cb275e181630c27c3baa1514648 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:47:31 -0300 Subject: [PATCH 22/45] Update and rename .github/workflow/stale.yml to .github/workflows/stale.yml --- .github/{workflow => workflows}/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/{workflow => workflows}/stale.yml (96%) diff --git a/.github/workflow/stale.yml b/.github/workflows/stale.yml similarity index 96% rename from .github/workflow/stale.yml rename to .github/workflows/stale.yml index edfafd24..98fbe377 100644 --- a/.github/workflow/stale.yml +++ b/.github/workflows/stale.yml @@ -1,7 +1,7 @@ name: Close inactive issues on: schedule: - - cron: "38 21 * * *" + - cron: "48 21 * * *" jobs: close-issues: From 60cc7d3c3ac11edf840484e7f63190c6898b361b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:53:11 -0300 Subject: [PATCH 23/45] Update stale.yml --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 98fbe377..302ce094 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -1,7 +1,7 @@ name: Close inactive issues on: schedule: - - cron: "48 21 * * *" + - cron: "55 21 * * *" jobs: close-issues: From c805843b41bb472a7e7021db7b6096f5c4a95999 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 19:09:06 -0300 Subject: [PATCH 24/45] Update stale.yml --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 302ce094..c837034b 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -1,7 +1,7 @@ name: Close inactive issues on: schedule: - - cron: "55 21 * * *" + - cron: "07 23 * * *" jobs: close-issues: From bad0b0af48392425659dd792b4481bbdb0b59a58 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 19:20:18 -0300 Subject: [PATCH 25/45] Update stale.yml --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c837034b..fd69b157 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -12,7 +12,7 @@ jobs: steps: - uses: actions/stale@v5 with: - days-before-issue-stale: -1 + days-before-issue-stale: 30 days-before-issue-close: 30 stale-issue-label: "stale" stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." From 15714580da3dddcd8fd4baf6efe02804cdf7893b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 19:39:21 -0300 Subject: [PATCH 26/45] Update stale.yml --- .github/workflows/stale.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index fd69b157..c9c11e1b 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -1,7 +1,7 @@ name: Close inactive issues on: schedule: - - cron: "07 23 * * *" + - cron: "50 22 * * *" jobs: close-issues: @@ -12,11 +12,11 @@ jobs: steps: - uses: actions/stale@v5 with: + stale-issue-message: "" + close-issue-message: "This issue has been closed due to inactivity for 30 days. If you believe it is still relevant, you can reopen it (if you are the author) or leave a comment below." days-before-issue-stale: 30 days-before-issue-close: 30 stale-issue-label: "stale" - stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." - close-issue-message: "This issue was closed because it has been inactive for 30 days. If you think that it is still relevant, feel free to reopen it (if you are the author) or comment down below." days-before-pr-stale: -1 days-before-pr-close: -1 repo-token: ${{ secrets.GITHUB_TOKEN }} From 79e519cff6fe769fe27aff65a94d3a8bf4c48c1e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 20:03:08 -0300 Subject: [PATCH 27/45] Update stale.yml --- .github/workflows/stale.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c9c11e1b..82cd1701 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -1,7 +1,7 @@ name: Close inactive issues on: schedule: - - cron: "50 22 * * *" + - cron: "10 23 * * *" jobs: close-issues: @@ -15,7 +15,7 @@ jobs: stale-issue-message: "" close-issue-message: "This issue has been closed due to inactivity for 30 days. If you believe it is still relevant, you can reopen it (if you are the author) or leave a comment below." 
days-before-issue-stale: 30 - days-before-issue-close: 30 + days-before-issue-close: 0 stale-issue-label: "stale" days-before-pr-stale: -1 days-before-pr-close: -1 From 0dab2c5dc5ac36738341e83a6ac2632008c8db77 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 22:18:03 -0300 Subject: [PATCH 28/45] Update feature_request.md --- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 2bdf1b9a..b94974f8 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -2,7 +2,7 @@ name: Feature request about: Suggest an improvement or new feature for the web UI title: '' -labels: '' +labels: 'enhancement' assignees: '' --- From 3fb8196e167d9fa466453496e9172d03f524d364 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 22:28:00 -0300 Subject: [PATCH 29/45] Implement "*Is recording a voice message...*" for TTS #303 --- extensions/silero_tts/script.py | 2 ++ modules/chat.py | 10 ++++++---- modules/shared.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 1d068229..f611dc27 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -81,6 +81,7 @@ def input_modifier(string): if (shared.args.chat or shared.args.cai_chat) and len(shared.history['internal']) > 0: shared.history['visible'][-1] = [shared.history['visible'][-1][0], shared.history['visible'][-1][1].replace('controls autoplay>','controls>')] + shared.processing_message = "*Is recording a voice message...*" return string def output_modifier(string): @@ -119,6 +120,7 @@ def output_modifier(string): if params['show_text']: string += f'\n\n{original_string}' + shared.processing_message = "*Is typing...*" return string def bot_prefix_modifier(string): diff --git a/modules/chat.py b/modules/chat.py index d78278c4..bd45b879 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -126,8 +126,9 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical else: prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) + # Yield *Is typing...* if not regenerate: - yield shared.history['visible']+[[visible_text, '*Is typing...*']] + yield shared.history['visible']+[[visible_text, shared.processing_message]] # Generate reply = '' @@ -168,7 +169,8 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True) reply = '' - yield '*Is typing...*' + # Yield *Is typing...* + yield shared.processing_message for i in range(chat_generation_attempts): for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"): reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True) @@ -187,8 +189,8 @@ def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typi else: last_visible = shared.history['visible'].pop() last_internal = shared.history['internal'].pop() - - yield 
generate_chat_output(shared.history['visible']+[[last_visible[0], '*Is typing...*']], name1, name2, shared.character) + # Yield '*Is typing...*' + yield generate_chat_output(shared.history['visible']+[[last_visible[0], shared.processing_message]], name1, name2, shared.character) for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True): if shared.args.cai_chat: shared.history['visible'][-1] = [last_visible[0], _history[-1][1]] diff --git a/modules/shared.py b/modules/shared.py index 8fcd4745..5411009a 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -11,6 +11,7 @@ is_RWKV = False history = {'internal': [], 'visible': []} character = 'None' stop_everything = False +processing_message = '*Is typing...*' # UI elements (buttons, sliders, HTML, etc) gradio = {} From a0ef82c895ef5e93c27ed92b0e92452562b22c83 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 22:35:28 -0300 Subject: [PATCH 30/45] Activate dependabot --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..91abb11f --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" From 715c3ecba6a0f2102b9e36cda0395697ac7fd722 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Mar 2023 01:36:02 +0000 Subject: [PATCH 31/45] Bump rwkv from 0.3.1 to 0.4.2 Bumps [rwkv](https://github.com/BlinkDL/ChatRWKV) from 0.3.1 to 0.4.2. - [Release notes](https://github.com/BlinkDL/ChatRWKV/releases) - [Commits](https://github.com/BlinkDL/ChatRWKV/commits) --- updated-dependencies: - dependency-name: rwkv dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6d0095aa..8eeaa995 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ flexgen==0.1.7 gradio==3.18.0 numpy requests -rwkv==0.3.1 +rwkv==0.4.2 safetensors==0.3.0 sentencepiece tqdm From df830885939eba8518c2b378e4747e70a752fc0c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Mar 2023 01:36:18 +0000 Subject: [PATCH 32/45] Bump bitsandbytes from 0.37.0 to 0.37.1 Bumps [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) from 0.37.0 to 0.37.1. 
- [Release notes](https://github.com/TimDettmers/bitsandbytes/releases) - [Changelog](https://github.com/TimDettmers/bitsandbytes/blob/main/CHANGELOG.md) - [Commits](https://github.com/TimDettmers/bitsandbytes/commits) --- updated-dependencies: - dependency-name: bitsandbytes dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6d0095aa..94e5a48b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ accelerate==0.17.0 -bitsandbytes==0.37.0 +bitsandbytes==0.37.1 flexgen==0.1.7 gradio==3.18.0 numpy From 02d407542c7b9cdeca085d5978a3449e11b711c9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Mar 2023 01:40:42 +0000 Subject: [PATCH 33/45] Bump accelerate from 0.17.0 to 0.17.1 Bumps [accelerate](https://github.com/huggingface/accelerate) from 0.17.0 to 0.17.1. - [Release notes](https://github.com/huggingface/accelerate/releases) - [Commits](https://github.com/huggingface/accelerate/compare/v0.17.0...v0.17.1) --- updated-dependencies: - dependency-name: accelerate dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4950905b..9bb2b74f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -accelerate==0.17.0 +accelerate==0.17.1 bitsandbytes==0.37.1 flexgen==0.1.7 gradio==3.18.0 From b5e0d3c2273bb161f41cb9fffd62beb0f5ebf858 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 23:02:25 -0300 Subject: [PATCH 34/45] Create config.yml --- .github/ISSUE_TEMPLATE/config.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..0d091e28 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,6 @@ +blank_issues_enabled: true +version: 2.1 +contact_links: + - name: Forum + url: https://github.com/oobabooga/text-generation-webui/discussions + about: General usage questions and community discussions From 33b9a15232f80dbde843e44cb436c15ae60d485a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 13 Mar 2023 23:03:16 -0300 Subject: [PATCH 35/45] Delete config.yml --- .github/ISSUE_TEMPLATE/config.yml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 0d091e28..00000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,6 +0,0 @@ -blank_issues_enabled: true -version: 2.1 -contact_links: - - name: Forum - url: https://github.com/oobabooga/text-generation-webui/discussions - about: General usage questions and community discussions From b3275545509a52ac54b1230c7169edee90495e4f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 00:18:13 -0300 Subject: [PATCH 36/45] Update bug_report_template.yml --- .github/ISSUE_TEMPLATE/bug_report_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yml 
b/.github/ISSUE_TEMPLATE/bug_report_template.yml index d55ce9f9..bd30a0c9 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_template.yml +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yml @@ -25,7 +25,7 @@ body: id: reproduction attributes: label: Reproduction - description: Please provide the steps necessary to reproduce your issue, including the command-line flags that were used (`--chat`, `--gpu-memory`, etc). + description: Please provide the steps necessary to reproduce your issue. placeholder: Reproduction validations: required: true From 265ba384b7e5e928d97d2749b25771b7d3d93fde Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 07:56:31 -0300 Subject: [PATCH 37/45] Rename a file, add deprecation warning for --load-in-4bit --- modules/{quant_loader.py => GPTQ_loader.py} | 0 modules/models.py | 2 +- modules/shared.py | 6 ++++++ 3 files changed, 7 insertions(+), 1 deletion(-) rename modules/{quant_loader.py => GPTQ_loader.py} (100%) diff --git a/modules/quant_loader.py b/modules/GPTQ_loader.py similarity index 100% rename from modules/quant_loader.py rename to modules/GPTQ_loader.py diff --git a/modules/models.py b/modules/models.py index 46cd77ff..f4bb11fd 100644 --- a/modules/models.py +++ b/modules/models.py @@ -89,7 +89,7 @@ def load_model(model_name): # Quantized model elif shared.args.gptq_bits > 0: - from modules.quant_loader import load_quantized + from modules.GPTQ_loader import load_quantized model = load_quantized(model_name) diff --git a/modules/shared.py b/modules/shared.py index 3abdc551..ea2eb50b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -69,6 +69,7 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.') parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') +parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.') parser.add_argument('--gptq-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.') parser.add_argument('--gptq-model-type', type=str, help='Model type of pre-quantized model. Currently only LLaMa and OPT are supported.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') @@ -95,3 +96,8 @@ parser.add_argument('--share', action='store_true', help='Create a public URL. T parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.') parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') args = parser.parse_args() + +# Provisional, this will be deleted later +if args.load_in_4bit: + print("Warning: --load-in-4bit is deprecated and will be removed. 
Use --gptq-bits 4 instead.\n") + args.gptq_bits = 4 From 87192e2813181ca4241c97bd258e19d52ff6f204 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 08:02:21 -0300 Subject: [PATCH 38/45] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f66126d8..1c267739 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ Optionally, you can use the following command-line flags: | `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. | | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| -| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.| +| `--load-in-4bit` | DEPRECATED: use `--gptq-bits 4` instead. | | `--gptq-bits GPTQ_BITS` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. | | `--gptq-model-type MODEL_TYPE` | Model type of pre-quantized model. Currently only LLaMa and OPT are supported. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | From afc5339510c71cb6a17a7e6ba9bc432545f03b83 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:04:17 -0300 Subject: [PATCH 39/45] Remove "eval" statements from text generation functions --- modules/text_generation.py | 65 ++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index d64481b2..70a51d91 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -122,7 +122,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi input_ids = encode(question, max_new_tokens) original_input_ids = input_ids output = input_ids[0] - cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()" + cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen)) eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else [] if eos_token is not None: eos_token_ids.append(int(encode(eos_token)[0][-1])) @@ -132,45 +132,48 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi t = encode(stopping_string, 0, add_special_tokens=False) stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0]))) + generate_params = {} if not shared.args.flexgen: - generate_params = [ - f"max_new_tokens=max_new_tokens", - f"eos_token_id={eos_token_ids}", - f"stopping_criteria=stopping_criteria_list", - f"do_sample={do_sample}", - f"temperature={temperature}", - f"top_p={top_p}", - f"typical_p={typical_p}", - f"repetition_penalty={repetition_penalty}", - f"top_k={top_k}", - f"min_length={min_length if shared.args.no_stream else 0}", - f"no_repeat_ngram_size={no_repeat_ngram_size}", - f"num_beams={num_beams}", - f"penalty_alpha={penalty_alpha}", - f"length_penalty={length_penalty}", - f"early_stopping={early_stopping}", - ] + generate_params.update({ + "max_new_tokens": max_new_tokens, + "eos_token_id": eos_token_ids, + "stopping_criteria": 
stopping_criteria_list, + "do_sample": do_sample, + "temperature": temperature, + "top_p": top_p, + "typical_p": typical_p, + "repetition_penalty": repetition_penalty, + "top_k": top_k, + "min_length": min_length if shared.args.no_stream else 0, + "no_repeat_ngram_size": no_repeat_ngram_size, + "num_beams": num_beams, + "penalty_alpha": penalty_alpha, + "length_penalty": length_penalty, + "early_stopping": early_stopping, + }) else: - generate_params = [ - f"max_new_tokens={max_new_tokens if shared.args.no_stream else 8}", - f"do_sample={do_sample}", - f"temperature={temperature}", - f"stop={eos_token_ids[-1]}", - ] + generate_params.update({ + "max_new_tokens": max_new_tokens if shared.args.no_stream else 8, + "do_sample": do_sample, + "temperature": temperature, + "stop": eos_token_ids[-1], + }) if shared.args.deepspeed: - generate_params.append("synced_gpus=True") + generate_params.update({"synced_gpus": True}) if shared.soft_prompt: inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) - generate_params.insert(0, "inputs_embeds=inputs_embeds") - generate_params.insert(0, "inputs=filler_input_ids") + generate_params.update({"inputs_embeds": inputs_embeds}) + generate_params.update({"inputs": filler_input_ids}) else: - generate_params.insert(0, "inputs=input_ids") + generate_params.update({"inputs": input_ids}) try: # Generate the entire reply at once. if shared.args.no_stream: with torch.no_grad(): - output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0] + output = shared.model.generate(**generate_params)[0] + if cuda: + output = output.cuda() if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) @@ -194,7 +197,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi return Iteratorize(generate_with_callback, kwargs, callback=None) yield formatted_outputs(original_question, shared.model_name) - with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator: + with generate_with_streaming(**generate_params) as generator: for output in generator: if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) @@ -214,7 +217,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi for i in range(max_new_tokens//8+1): clear_torch_cache() with torch.no_grad(): - output = eval(f"shared.model.generate({', '.join(generate_params)})")[0] + output = shared.model.generate(**generate_params)[0] if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) reply = decode(output) From 72d207c0980232db287f9ce89ec4dd3b032465e5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:31:27 -0300 Subject: [PATCH 40/45] Remove the chat API It is not implemented, has not been tested, and this is causing confusion. 
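Background for the change: a Gradio click event only becomes a public API route when api_name is passed, so dropping the argument from the chat-mode Generate handler removes the endpoint without affecting the UI button. Below is a minimal illustrative sketch of that pattern, assuming Gradio 3.x semantics; the chatbot_wrapper stub stands in for the real chat function and is not the project's code.

import gradio as gr

def chatbot_wrapper(text):
    # Stand-in for the real chat generation function.
    return f"echo: {text}"

with gr.Blocks() as demo:
    textbox = gr.Textbox(label='Input')
    display = gr.Textbox(label='Output')
    generate = gr.Button('Generate')
    # Passing api_name registers this click event as a named API route
    # (e.g. /run/textgen in the Gradio version pinned in requirements.txt);
    # omitting it, as this patch does for chat mode, keeps the button
    # working in the UI but does not expose a public endpoint.
    generate.click(chatbot_wrapper, textbox, display, api_name='textgen')

The notebook and default modes keep api_name='textgen' on their Generate events, so the /run/textgen endpoint used by api-example.py remains available there.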
--- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 08b1a478..a54e3b62 100644 --- a/server.py +++ b/server.py @@ -269,7 +269,7 @@ if shared.args.chat or shared.args.cai_chat: function_call = 'chat.cai_chatbot_wrapper' if shared.args.cai_chat else 'chat.chatbot_wrapper' - gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream, api_name='textgen')) + gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream)) From b419dffba3592792dd6292b7d6f4943850fd6195 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 17:55:35 -0300 Subject: [PATCH 41/45] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c267739..141d549d 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,9 @@ pip3 install torch torchvision torchaudio --extra-index-url https://download.pyt conda install pytorch torchvision torchaudio git -c pytorch ``` -See also: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). +> **Note** +> 1. If you are on Windows, it may be easier to run the commands above in an WSL environment. The performance may also be better. +> 2. For more detailed, user-contributed guide, see: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). ## Installation option 2: one-click installers From 1236c7f97153b1de6905ed46a4293d3e625f1894 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 17:56:15 -0300 Subject: [PATCH 42/45] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 141d549d..2bfbd2f8 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ conda install pytorch torchvision torchaudio git -c pytorch ``` > **Note** -> 1. If you are on Windows, it may be easier to run the commands above in an WSL environment. The performance may also be better. +> 1. If you are on Windows, it may be easier to run the commands above in a WSL environment. The performance may also be better. > 2. For more detailed, user-contributed guide, see: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). 
## Installation option 2: one-click installers From 128d18e2988c5f42f95476cc09c89aff9741e44a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 14 Mar 2023 17:57:25 -0300 Subject: [PATCH 43/45] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2bfbd2f8..c9834558 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ conda install pytorch torchvision torchaudio git -c pytorch > **Note** > 1. If you are on Windows, it may be easier to run the commands above in a WSL environment. The performance may also be better. -> 2. For more detailed, user-contributed guide, see: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). +> 2. For a more detailed, user-contributed guide, see: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). ## Installation option 2: one-click installers From 9d6a625bd6f2e7939ef2fa6dc4576c5744a84d83 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 15 Mar 2023 11:04:30 -0300 Subject: [PATCH 44/45] Add 'hallucinations' filter #326 This breaks the API since a new parameter has been added. It should be a one-line fix. See api-example.py. --- api-example-stream.py | 2 ++ api-example.py | 2 ++ modules/chat.py | 16 ++++++++-------- modules/text_generation.py | 3 ++- server.py | 20 +++++++++++--------- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/api-example-stream.py b/api-example-stream.py index a5ed4202..add1df41 100644 --- a/api-example-stream.py +++ b/api-example-stream.py @@ -26,6 +26,7 @@ async def run(context): 'top_p': 0.9, 'typical_p': 1, 'repetition_penalty': 1.05, + 'encoder_repetition_penalty': 1.0, 'top_k': 0, 'min_length': 0, 'no_repeat_ngram_size': 0, @@ -59,6 +60,7 @@ async def run(context): params['top_p'], params['typical_p'], params['repetition_penalty'], + params['encoder_repetition_penalty'], params['top_k'], params['min_length'], params['no_repeat_ngram_size'], diff --git a/api-example.py b/api-example.py index 0306b7ab..a6f0c10e 100644 --- a/api-example.py +++ b/api-example.py @@ -24,6 +24,7 @@ params = { 'top_p': 0.9, 'typical_p': 1, 'repetition_penalty': 1.05, + 'encoder_repetition_penalty': 1.0, 'top_k': 0, 'min_length': 0, 'no_repeat_ngram_size': 0, @@ -45,6 +46,7 @@ response = requests.post(f"http://{server}:7860/run/textgen", json={ params['top_p'], params['typical_p'], params['repetition_penalty'], + params['encoder_repetition_penalty'], params['top_k'], params['min_length'], params['no_repeat_ngram_size'], diff --git a/modules/chat.py b/modules/chat.py index bd45b879..d7202bee 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -97,7 +97,7 @@ def extract_message_from_reply(question, reply, name1, name2, check, impersonate def stop_everything_event(): shared.stop_everything = True -def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False): +def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, 
length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False): shared.stop_everything = False just_started = True eos_token = '\n' if check else None @@ -133,7 +133,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical # Generate reply = '' for i in range(chat_generation_attempts): - for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"): + for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"): # Extracting the reply reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check) @@ -160,7 +160,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical yield shared.history['visible'] -def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): +def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): eos_token = '\n' if check else None if 'pygmalion' in shared.model_name.lower(): @@ -172,18 +172,18 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ # Yield *Is typing...* yield shared.processing_message for i in range(chat_generation_attempts): - for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"): + for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"): reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True) yield reply if next_character_found: break yield reply -def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): - for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts): +def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, 
typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): + for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts): yield generate_chat_html(_history, name1, name2, shared.character) -def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): +def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): if (shared.character != 'None' and len(shared.history['visible']) == 1) or len(shared.history['internal']) == 0: yield generate_chat_output(shared.history['visible'], name1, name2, shared.character) else: @@ -191,7 +191,7 @@ def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typi last_internal = shared.history['internal'].pop() # Yield '*Is typing...*' yield generate_chat_output(shared.history['visible']+[[last_visible[0], shared.processing_message]], name1, name2, shared.character) - for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True): + for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True): if shared.args.cai_chat: shared.history['visible'][-1] = [last_visible[0], _history[-1][1]] else: diff --git a/modules/text_generation.py b/modules/text_generation.py index 70a51d91..f302a918 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -89,7 +89,7 @@ def clear_torch_cache(): if not shared.args.cpu: torch.cuda.empty_cache() -def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None): +def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None): clear_torch_cache() t0 = time.time() @@ -143,6 +143,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi "top_p": top_p, "typical_p": typical_p, "repetition_penalty": repetition_penalty, + "encoder_repetition_penalty": 
encoder_repetition_penalty, "top_k": top_k, "min_length": min_length if shared.args.no_stream else 0, "no_repeat_ngram_size": no_repeat_ngram_size, diff --git a/server.py b/server.py index a54e3b62..4ac81f01 100644 --- a/server.py +++ b/server.py @@ -66,6 +66,7 @@ def load_preset_values(preset_menu, return_dict=False): 'top_p': 1, 'typical_p': 1, 'repetition_penalty': 1, + 'encoder_repetition_penalty': 1, 'top_k': 50, 'num_beams': 1, 'penalty_alpha': 0, @@ -86,7 +87,7 @@ def load_preset_values(preset_menu, return_dict=False): if return_dict: return generate_params else: - return generate_params['do_sample'], generate_params['temperature'], generate_params['top_p'], generate_params['typical_p'], generate_params['repetition_penalty'], generate_params['top_k'], generate_params['min_length'], generate_params['no_repeat_ngram_size'], generate_params['num_beams'], generate_params['penalty_alpha'], generate_params['length_penalty'], generate_params['early_stopping'] + return generate_params['do_sample'], generate_params['temperature'], generate_params['top_p'], generate_params['typical_p'], generate_params['repetition_penalty'], generate_params['encoder_repetition_penalty'], generate_params['top_k'], generate_params['min_length'], generate_params['no_repeat_ngram_size'], generate_params['num_beams'], generate_params['penalty_alpha'], generate_params['length_penalty'], generate_params['early_stopping'] def upload_soft_prompt(file): with zipfile.ZipFile(io.BytesIO(file)) as zf: @@ -117,14 +118,15 @@ def create_settings_menus(default_preset): with gr.Row(): with gr.Column(): shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') - shared.gradio['repetition_penalty'] = gr.Slider(1.0, 2.99, value=generate_params['repetition_penalty'],step=0.01,label='repetition_penalty') - shared.gradio['top_k'] = gr.Slider(0,200,value=generate_params['top_k'],step=1,label='top_k') shared.gradio['top_p'] = gr.Slider(0.0,1.0,value=generate_params['top_p'],step=0.01,label='top_p') - with gr.Column(): - shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + shared.gradio['top_k'] = gr.Slider(0,200,value=generate_params['top_k'],step=1,label='top_k') shared.gradio['typical_p'] = gr.Slider(0.0,1.0,value=generate_params['typical_p'],step=0.01,label='typical_p') + with gr.Column(): + shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'],step=0.01,label='repetition_penalty') + shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'],step=0.01,label='encoder_repetition_penalty') shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'] if shared.args.no_stream else 0, label='min_length', interactive=shared.args.no_stream) + shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') gr.Markdown('Contrastive search:') shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha') @@ -147,7 +149,7 @@ def create_settings_menus(default_preset): shared.gradio['upload_softprompt'] = gr.File(type='binary', file_types=['.zip']) shared.gradio['model_menu'].change(load_model_wrapper, [shared.gradio['model_menu']], [shared.gradio['model_menu']], show_progress=True) 
- shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio['preset_menu']], [shared.gradio['do_sample'], shared.gradio['temperature'], shared.gradio['top_p'], shared.gradio['typical_p'], shared.gradio['repetition_penalty'], shared.gradio['top_k'], shared.gradio['min_length'], shared.gradio['no_repeat_ngram_size'], shared.gradio['num_beams'], shared.gradio['penalty_alpha'], shared.gradio['length_penalty'], shared.gradio['early_stopping']]) + shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio['preset_menu']], [shared.gradio['do_sample'], shared.gradio['temperature'], shared.gradio['top_p'], shared.gradio['typical_p'], shared.gradio['repetition_penalty'], shared.gradio['encoder_repetition_penalty'], shared.gradio['top_k'], shared.gradio['min_length'], shared.gradio['no_repeat_ngram_size'], shared.gradio['num_beams'], shared.gradio['penalty_alpha'], shared.gradio['length_penalty'], shared.gradio['early_stopping']]) shared.gradio['softprompts_menu'].change(load_soft_prompt, [shared.gradio['softprompts_menu']], [shared.gradio['softprompts_menu']], show_progress=True) shared.gradio['upload_softprompt'].upload(upload_soft_prompt, [shared.gradio['upload_softprompt']], [shared.gradio['softprompts_menu']]) @@ -262,7 +264,7 @@ if shared.args.chat or shared.args.cai_chat: shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)') create_settings_menus(default_preset) - shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']] + shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']] if shared.args.extensions is not None: with gr.Tab('Extensions'): extensions_module.create_extensions_block() @@ -329,7 +331,7 @@ elif shared.args.notebook: if shared.args.extensions is not None: extensions_module.create_extensions_block() - shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']] + shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']] output_params = [shared.gradio[k] for k in ['textbox', 'markdown', 'html']] gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen')) gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, 
show_progress=shared.args.no_stream)) @@ -361,7 +363,7 @@ else: with gr.Tab('HTML'): shared.gradio['html'] = gr.HTML() - shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']] + shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']] output_params = [shared.gradio[k] for k in ['output_textbox', 'markdown', 'html']] gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen')) gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream)) From 14139317055f2dd947e54472cc81d69956f25fe3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 15 Mar 2023 12:01:32 -0300 Subject: [PATCH 45/45] Add a header bar and redesign the interface (#293) --- extensions/gallery/script.py | 2 +- modules/ui.py | 9 ++ server.py | 159 ++++++++++++++++++++--------------- 3 files changed, 99 insertions(+), 71 deletions(-) diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index 8a2d7cf9..fbf23bc9 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -76,7 +76,7 @@ def generate_html(): return container_html def ui(): - with gr.Accordion("Character gallery"): + with gr.Accordion("Character gallery", open=False): update = gr.Button("Refresh") gallery = gr.HTML(value=generate_html()) update.click(generate_html, [], gallery) diff --git a/modules/ui.py b/modules/ui.py index bb193e35..27233153 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -38,6 +38,9 @@ svg { ol li p, ul li p { display: inline-block; } +#main, #settings, #extensions, #chat-settings { + border: 0; +} """ chat_css = """ @@ -64,6 +67,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } """ +page_js = """ +document.getElementById("main").parentNode.childNodes[0].style = "border: none; background-color: #8080802b; margin-bottom: 40px" +document.getElementById("main").parentNode.style = "padding: 0; margin: 0" +document.getElementById("main").parentNode.parentNode.parentNode.style = "padding: 0" +""" + class ToolButton(gr.Button, gr.components.FormComponent): """Small button with single emoji as text, fits inside gradio forms""" diff --git a/server.py b/server.py index 4ac81f01..a7ec4888 100644 --- a/server.py +++ b/server.py @@ -101,9 +101,7 @@ def upload_soft_prompt(file): return name -def create_settings_menus(default_preset): - generate_params = load_preset_values(default_preset if not shared.args.flexgen else 'Naive', return_dict=True) - +def create_model_and_preset_menus(): with gr.Row(): with gr.Column(): with gr.Row(): @@ -114,7 +112,11 @@ def create_settings_menus(default_preset): shared.gradio['preset_menu'] = gr.Dropdown(choices=available_presets, value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset') ui.create_refresh_button(shared.gradio['preset_menu'], lambda : None, lambda : {'choices': get_available_presets()}, 'refresh-button') - with gr.Accordion('Custom generation parameters', open=False, 
elem_id='accordion'): +def create_settings_menus(default_preset): + generate_params = load_preset_values(default_preset if not shared.args.flexgen else 'Naive', return_dict=True) + + with gr.Box(): + gr.Markdown('Custom generation parameters') with gr.Row(): with gr.Column(): shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') @@ -128,9 +130,11 @@ def create_settings_menus(default_preset): shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'] if shared.args.no_stream else 0, label='min_length', interactive=shared.args.no_stream) shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + with gr.Box(): gr.Markdown('Contrastive search:') shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha') + with gr.Box(): gr.Markdown('Beam search (uses a lot of VRAM):') with gr.Row(): with gr.Column(): @@ -139,7 +143,8 @@ def create_settings_menus(default_preset): shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') - with gr.Accordion('Soft prompt', open=False, elem_id='accordion'): + with gr.Box(): + gr.Markdown('Soft prompt') with gr.Row(): shared.gradio['softprompts_menu'] = gr.Dropdown(choices=available_softprompts, value='None', label='Soft prompt') ui.create_refresh_button(shared.gradio['softprompts_menu'], lambda : None, lambda : {'choices': get_available_softprompts()}, 'refresh-button') @@ -202,26 +207,41 @@ suffix = '_pygmalion' if 'pygmalion' in shared.model_name.lower() else '' if shared.args.chat or shared.args.cai_chat: with gr.Blocks(css=ui.css+ui.chat_css, analytics_enabled=False, title=title) as shared.gradio['interface']: - if shared.args.cai_chat: - shared.gradio['display'] = gr.HTML(value=generate_chat_html(shared.history['visible'], shared.settings[f'name1{suffix}'], shared.settings[f'name2{suffix}'], shared.character)) - else: - shared.gradio['display'] = gr.Chatbot(value=shared.history['visible']).style(color_map=("#326efd", "#212528")) - shared.gradio['textbox'] = gr.Textbox(label='Input') - with gr.Row(): - shared.gradio['Stop'] = gr.Button('Stop') - shared.gradio['Generate'] = gr.Button('Generate') - with gr.Row(): - shared.gradio['Impersonate'] = gr.Button('Impersonate') - shared.gradio['Regenerate'] = gr.Button('Regenerate') - with gr.Row(): - shared.gradio['Copy last reply'] = gr.Button('Copy last reply') - shared.gradio['Replace last reply'] = gr.Button('Replace last reply') - shared.gradio['Remove last'] = gr.Button('Remove last') + with gr.Tab("Text generation", elem_id="main"): + if shared.args.cai_chat: + shared.gradio['display'] = gr.HTML(value=generate_chat_html(shared.history['visible'], shared.settings[f'name1{suffix}'], shared.settings[f'name2{suffix}'], shared.character)) + else: + shared.gradio['display'] = gr.Chatbot(value=shared.history['visible']).style(color_map=("#326efd", "#212528")) + shared.gradio['textbox'] = gr.Textbox(label='Input') + with gr.Row(): + shared.gradio['Stop'] = gr.Button('Stop') + shared.gradio['Generate'] = gr.Button('Generate') + with gr.Row(): + shared.gradio['Impersonate'] = gr.Button('Impersonate') + shared.gradio['Regenerate'] = gr.Button('Regenerate') + with gr.Row(): + shared.gradio['Copy last reply'] = gr.Button('Copy last reply') + shared.gradio['Replace 
last reply'] = gr.Button('Replace last reply') + shared.gradio['Remove last'] = gr.Button('Remove last') - shared.gradio['Clear history'] = gr.Button('Clear history') - shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant="stop", visible=False) - shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) - with gr.Tab('Chat settings'): + shared.gradio['Clear history'] = gr.Button('Clear history') + shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant="stop", visible=False) + shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) + + create_model_and_preset_menus() + + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['chat_prompt_size_slider'] = gr.Slider(minimum=shared.settings['chat_prompt_size_min'], maximum=shared.settings['chat_prompt_size_max'], step=1, label='Maximum prompt size in tokens', value=shared.settings['chat_prompt_size']) + with gr.Column(): + shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)') + + if shared.args.extensions is not None: + extensions_module.create_extensions_block() + + with gr.Tab("Chat settings", elem_id="chat-settings"): shared.gradio['name1'] = gr.Textbox(value=shared.settings[f'name1{suffix}'], lines=1, label='Your name') shared.gradio['name2'] = gr.Textbox(value=shared.settings[f'name2{suffix}'], lines=1, label='Bot\'s name') shared.gradio['context'] = gr.Textbox(value=shared.settings[f'context{suffix}'], lines=5, label='Context') @@ -255,21 +275,11 @@ if shared.args.chat or shared.args.cai_chat: with gr.Tab('Upload TavernAI Character Card'): shared.gradio['upload_img_tavern'] = gr.File(type='binary', file_types=['image']) - with gr.Tab('Generation settings'): - with gr.Row(): - with gr.Column(): - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - with gr.Column(): - shared.gradio['chat_prompt_size_slider'] = gr.Slider(minimum=shared.settings['chat_prompt_size_min'], maximum=shared.settings['chat_prompt_size_max'], step=1, label='Maximum prompt size in tokens', value=shared.settings['chat_prompt_size']) - shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)') + with gr.Tab("Settings", elem_id="settings"): create_settings_menus(default_preset) - shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']] - if shared.args.extensions is not None: - with gr.Tab('Extensions'): - extensions_module.create_extensions_block() - function_call = 
'chat.cai_chatbot_wrapper' if shared.args.cai_chat else 'chat.chatbot_wrapper' + shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']] gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) @@ -310,58 +320,66 @@ if shared.args.chat or shared.args.cai_chat: shared.gradio['upload_img_me'].upload(reload_func, reload_inputs, [shared.gradio['display']]) shared.gradio['Stop'].click(reload_func, reload_inputs, [shared.gradio['display']]) + shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.page_js}}}") shared.gradio['interface'].load(lambda : chat.load_default_history(shared.settings[f'name1{suffix}'], shared.settings[f'name2{suffix}']), None, None) shared.gradio['interface'].load(reload_func, reload_inputs, [shared.gradio['display']], show_progress=True) elif shared.args.notebook: with gr.Blocks(css=ui.css, analytics_enabled=False, title=title) as shared.gradio['interface']: - gr.Markdown(description) - with gr.Tab('Raw'): - shared.gradio['textbox'] = gr.Textbox(value=default_text, lines=23) - with gr.Tab('Markdown'): - shared.gradio['markdown'] = gr.Markdown() - with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() + with gr.Tab("Text generation", elem_id="main"): + with gr.Tab('Raw'): + shared.gradio['textbox'] = gr.Textbox(value=default_text, lines=25) + with gr.Tab('Markdown'): + shared.gradio['markdown'] = gr.Markdown() + with gr.Tab('HTML'): + shared.gradio['html'] = gr.HTML() - shared.gradio['Generate'] = gr.Button('Generate') - shared.gradio['Stop'] = gr.Button('Stop') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + with gr.Row(): + shared.gradio['Stop'] = gr.Button('Stop') + shared.gradio['Generate'] = gr.Button('Generate') + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - create_settings_menus(default_preset) - if shared.args.extensions is not None: - extensions_module.create_extensions_block() + create_model_and_preset_menus() + if shared.args.extensions is not None: + extensions_module.create_extensions_block() + + with gr.Tab("Settings", elem_id="settings"): + create_settings_menus(default_preset) shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']] output_params = [shared.gradio[k] for k in ['textbox', 'markdown', 'html']] gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen')) 
gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream)) shared.gradio['Stop'].click(None, None, None, cancels=gen_events) + shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.page_js}}}") else: with gr.Blocks(css=ui.css, analytics_enabled=False, title=title) as shared.gradio['interface']: - gr.Markdown(description) - with gr.Row(): - with gr.Column(): - shared.gradio['textbox'] = gr.Textbox(value=default_text, lines=15, label='Input') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - shared.gradio['Generate'] = gr.Button('Generate') - with gr.Row(): - with gr.Column(): - shared.gradio['Continue'] = gr.Button('Continue') - with gr.Column(): - shared.gradio['Stop'] = gr.Button('Stop') + with gr.Tab("Text generation", elem_id="main"): + with gr.Row(): + with gr.Column(): + shared.gradio['textbox'] = gr.Textbox(value=default_text, lines=15, label='Input') + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['Generate'] = gr.Button('Generate') + with gr.Row(): + with gr.Column(): + shared.gradio['Continue'] = gr.Button('Continue') + with gr.Column(): + shared.gradio['Stop'] = gr.Button('Stop') - create_settings_menus(default_preset) - if shared.args.extensions is not None: - extensions_module.create_extensions_block() + create_model_and_preset_menus() + if shared.args.extensions is not None: + extensions_module.create_extensions_block() - with gr.Column(): - with gr.Tab('Raw'): - shared.gradio['output_textbox'] = gr.Textbox(lines=15, label='Output') - with gr.Tab('Markdown'): - shared.gradio['markdown'] = gr.Markdown() - with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() + with gr.Column(): + with gr.Tab('Raw'): + shared.gradio['output_textbox'] = gr.Textbox(lines=25, label='Output') + with gr.Tab('Markdown'): + shared.gradio['markdown'] = gr.Markdown() + with gr.Tab('HTML'): + shared.gradio['html'] = gr.HTML() + with gr.Tab("Settings", elem_id="settings"): + create_settings_menus(default_preset) shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']] output_params = [shared.gradio[k] for k in ['output_textbox', 'markdown', 'html']] @@ -369,6 +387,7 @@ else: gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream)) gen_events.append(shared.gradio['Continue'].click(generate_reply, [shared.gradio['output_textbox']] + shared.input_params[1:], output_params, show_progress=shared.args.no_stream)) shared.gradio['Stop'].click(None, None, None, cancels=gen_events) + shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.page_js}}}") shared.gradio['interface'].queue() if shared.args.listen: