From e61316ce0b06433359997abbab1d966dad96d79b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 11:52:13 -0300
Subject: [PATCH 1/8] Detect airoboros and Nous-Hermes
---
models/config.yaml | 6 ++++++
1 file changed, 6 insertions(+)
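Note on the mechanism: the keys in `models/config.yaml` are regular expressions that get matched against the model's folder name, and a matching entry's values are applied as that model's settings. A minimal sketch of that lookup, assuming case-insensitive `re.match` semantics (the webui's actual merging logic may differ):

```
import re

# Hedged sketch: keys like '.*Nous-Hermes-13b' are treated as regexes and
# matched against the model name; matching entries contribute settings.
def settings_for(model_name, model_config):
    settings = {}
    for pattern, overrides in model_config.items():
        if re.match(pattern.lower(), model_name.lower()):
            settings.update(overrides)
    return settings

config = {
    '.*Nous-Hermes-13b': {'mode': 'instruct', 'instruction_template': 'Alpaca'},
    '.*airoboros-13b-gpt4': {'mode': 'instruct', 'instruction_template': 'Vicuna-v1.1'},
}
print(settings_for('TheBloke_Nous-Hermes-13B-GPTQ', config))
# -> {'mode': 'instruct', 'instruction_template': 'Alpaca'}
```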
diff --git a/models/config.yaml b/models/config.yaml
index db48e5a5..59a2afbb 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -180,3 +180,9 @@ llama-65b-gptq-3bit:
.*bluemoonrp-(30|13)b:
mode: 'instruct'
instruction_template: 'Bluemoon'
+.*Nous-Hermes-13b:
+ mode: 'instruct'
+ instruction_template: 'Alpaca'
+.*airoboros-13b-gpt4:
+ mode: 'instruct'
+ instruction_template: 'Vicuna-v1.1'
From 9b0e95abeb9e156a68665186a8651f2051cb2653 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 11:56:03 -0300
Subject: [PATCH 2/8] Fix "regenerate" when "Start reply with" is set
---
modules/chat.py | 4 ++++
1 file changed, 4 insertions(+)
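For context: with "Start reply with" set, pressing "Regenerate" used to append a second dummy user/bot exchange instead of replacing the last one. The fix pops the last exchange first and resubmits its user message as a fresh turn. A self-contained sketch of the assumed behavior of `remove_last_message()` (the real helper in modules/chat.py operates on the shared history object and may differ in detail):

```
# Hedged sketch, not the actual module: history is a list of
# [user_message, bot_reply] pairs; regenerating with a reply prefix first
# drops the last exchange and reuses its user message as fresh input.
def remove_last_message(history):
    if history:
        user_text, _bot_reply = history.pop()
        return user_text
    return ''

history = [['Hello', 'Hi there!'], ['Tell me a joke', 'Why did...']]
text = remove_last_message(history)
print(text, history)  # 'Tell me a joke' [['Hello', 'Hi there!']]
```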
diff --git a/modules/chat.py b/modules/chat.py
index f3388737..63042f1f 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -325,6 +325,10 @@ def generate_chat_reply(text, history, state, regenerate=False, _continue=False,
# Same as above but returns HTML for the UI
def generate_chat_reply_wrapper(text, start_with, state, regenerate=False, _continue=False):
if start_with != '' and _continue == False:
+ if regenerate == True:
+ text = remove_last_message()
+ regenerate = False
+
_continue = True
send_dummy_message(text)
send_dummy_reply(start_with)
From 6a75bda4194fe16ddce804bd1a5af7602573a13d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 12:07:52 -0300
Subject: [PATCH 3/8] Assign some 4096 seq lengths
---
models/config.yaml | 2 ++
server.py | 1 +
2 files changed, 3 insertions(+)
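The server.py hunk registers `truncation_length` under the catch-all `'.*'` key, so the global setting becomes a per-model default that specific regex entries (like the two 4096 values below) can override. An illustrative merge, with an assumed global default of 2048:

```
# Illustrative only: the '.*' entry supplies defaults for every model;
# a more specific pattern's values win. 2048 is an assumed default here.
defaults = {'truncation_length': 2048}
bluemoonrp = {'mode': 'instruct',
              'instruction_template': 'Bluemoon',
              'truncation_length': 4096}

effective = {**defaults, **bluemoonrp}
print(effective['truncation_length'])  # 4096
```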
diff --git a/models/config.yaml b/models/config.yaml
index 59a2afbb..2c092678 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -180,9 +180,11 @@ llama-65b-gptq-3bit:
.*bluemoonrp-(30|13)b:
mode: 'instruct'
instruction_template: 'Bluemoon'
+ truncation_length: 4096
.*Nous-Hermes-13b:
mode: 'instruct'
instruction_template: 'Alpaca'
.*airoboros-13b-gpt4:
mode: 'instruct'
instruction_template: 'Vicuna-v1.1'
+ truncation_length: 4096
diff --git a/server.py b/server.py
index 1c8a5fe0..8bfb45aa 100644
--- a/server.py
+++ b/server.py
@@ -1049,6 +1049,7 @@ if __name__ == "__main__":
'mode': shared.settings['mode'],
'skip_special_tokens': shared.settings['skip_special_tokens'],
'custom_stopping_strings': shared.settings['custom_stopping_strings'],
+ 'truncation_length': shared.settings['truncation_length'],
}
shared.model_config.move_to_end('.*', last=False) # Move to the beginning
From 632571a0095d3972e5832b81becc9cb6b6bba807 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 15:16:06 -0300
Subject: [PATCH 4/8] Update README
---
README.md | 62 +++++++------------
.../api-example-chat-stream.py | 0
.../api-example-chat.py | 0
.../api-example-stream.py | 0
api-example.py => api-examples/api-example.py | 0
5 files changed, 23 insertions(+), 39 deletions(-)
rename api-example-chat-stream.py => api-examples/api-example-chat-stream.py (100%)
rename api-example-chat.py => api-examples/api-example-chat.py (100%)
rename api-example-stream.py => api-examples/api-example-stream.py (100%)
rename api-example.py => api-examples/api-example.py (100%)
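One technical detail kept in the rewritten README: protected models are downloaded by setting the `HF_USER` and `HF_PASS` environment variables. A hedged sketch of how a script can consume them (illustrative only; `download-model.py` itself may handle authentication differently):

```
import os

import requests

# Illustrative only: read the documented env vars and attach HTTP basic
# auth (a User Access Token works as the password) to Hub requests.
user = os.environ.get('HF_USER')
password = os.environ.get('HF_PASS')
auth = (user, password) if user and password else None

r = requests.get('https://huggingface.co/api/models/facebook/opt-1.3b', auth=auth)
r.raise_for_status()
print(r.json()['modelId'])
```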
diff --git a/README.md b/README.md
index e6ca1cae..cfb90c46 100644
--- a/README.md
+++ b/README.md
@@ -10,28 +10,23 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features
-* Dropdown menu for switching between models
-* Notebook mode that resembles OpenAI's playground
-* Chat mode for conversation and role-playing
-* Instruct mode compatible with various formats, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy
+* 3 interface modes: default, notebook, and chat
+* Dropdown menu for quickly switching between different models
+* LoRA: load and unload LoRAs on the fly, load multiple LoRAs at the same time, train a new LoRA
+* Precise instruction templates in chat mode, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy
* [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal)
+* Multiple model backends: transformers, llama.cpp, AutoGPTQ, GPTQ-for-LLaMa, RWKV, FlexGen
+* 8-bit and 4-bit inference through bitsandbytes
+* CPU mode for transformers models
+* [DeepSpeed ZeRO-3 inference](docs/DeepSpeed.md)
+* [Extension support](docs/Extensions.md)
+* [Custom chat characters](docs/Chat-mode.md)
+* Very efficient text streaming
* Markdown output with LaTeX rendering, to use for instance with [GALACTICA](https://github.com/paperswithcode/galai)
* Nice HTML output for GPT-4chan
-* [Custom chat characters](docs/Chat-mode.md)
-* Advanced chat features (send images, get audio responses with TTS)
-* Very efficient text streaming
-* Parameter presets
-* [LLaMA model](docs/LLaMA-model.md)
-* [4-bit GPTQ mode](docs/GPTQ-models-(4-bit-mode).md)
-* [LoRA (loading and training)](docs/Using-LoRAs.md)
-* [llama.cpp](docs/llama.cpp-models.md)
-* 8-bit and 4-bit through bitsandbytes
-* Layers splitting across GPU(s), CPU, and disk
-* CPU mode
-* [FlexGen](docs/FlexGen.md)
-* [DeepSpeed ZeRO-3](docs/DeepSpeed.md)
-* API [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-stream.py) streaming and [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming
-* [Extensions](docs/Extensions.md) - see the [user extensions list](https://github.com/oobabooga/text-generation-webui-extensions)
+* API, including endpoints for websocket streaming ([see the examples](https://github.com/oobabooga/text-generation-webui/blob/main/api-examples))
+
+To learn how to use the various features, check out the Documentation: https://github.com/oobabooga/text-generation-webui/tree/main/docs
## Installation
@@ -95,14 +90,6 @@ cd text-generation-webui
pip install -r requirements.txt
```
-#### 4. Install GPTQ
-
-The base installation covers [transformers](https://github.com/huggingface/transformers) models (`AutoModelForCausalLM` and `AutoModelForSeq2SeqLM` specifically) and [llama.cpp](https://github.com/ggerganov/llama.cpp) (GGML) models.
-
-To use GPTQ models, the additional installation steps below are necessary:
-
-[GPTQ models (4 bit mode)](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md)
-
#### llama.cpp with GPU acceleration
Requires the additional compilation step described here: [GPU acceleration](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md#gpu-acceleration).
@@ -154,9 +141,7 @@ For example:
python download-model.py facebook/opt-1.3b
-* If you want to download a model manually, note that all you need are the json, txt, and pytorch\*.bin (or model*.safetensors) files. The remaining files are not necessary.
-
-* Set env vars `HF_USER` and `HF_PASS` to your Hugging Face username and password (or [User Access Token](https://huggingface.co/settings/tokens)) to download a protected model. The model's terms must first be accepted on the HF website.
+To download a protected model, set env vars `HF_USER` and `HF_PASS` to your Hugging Face username and password (or [User Access Token](https://huggingface.co/settings/tokens)). The model's terms must first be accepted on the HF website.
#### GGML models
@@ -164,6 +149,10 @@ You can drop these directly into the `models/` folder, making sure that the file
#### GPT-4chan
+<details>
+<summary>
+Download instructions
+</summary>
[GPT-4chan](https://huggingface.co/ykilcher/gpt-4chan) has been shut down from Hugging Face, so you need to download it elsewhere. You have two options:
* Torrent: [16-bit](https://archive.org/details/gpt4chan_model_float16) / [32-bit](https://archive.org/details/gpt4chan_model)
@@ -181,6 +170,9 @@ After downloading the model, follow these steps:
python download-model.py EleutherAI/gpt-j-6B --text-only
```
+When you load this model in default or notebook modes, the "HTML" tab will show the generated text in 4chan format.
+
+</details>
## Starting the web UI
conda activate textgen
@@ -331,15 +323,7 @@ Out of memory errors? [Check the low VRAM guide](docs/Low-VRAM-guide.md).
Inference settings presets can be created under `presets/` as yaml files. These files are detected automatically at startup.
-By default, 10 presets based on NovelAI and KoboldAI presets are included. These were selected out of a sample of 43 presets after applying a K-Means clustering algorithm and selecting the elements closest to the average of each cluster.
-
-[Visualization](https://user-images.githubusercontent.com/112222186/228956352-1addbdb9-2456-465a-b51d-089f462cd385.png)
-
-## Documentation
-
-Make sure to check out the documentation for an in-depth guide on how to use the web UI.
-
-https://github.com/oobabooga/text-generation-webui/tree/main/docs
+By default, 10 presets based on NovelAI and KoboldAI presets are included. These were selected out of a sample of 43 presets after applying a K-Means clustering algorithm and selecting the elements closest to the average of each cluster: [tSNE visualization](https://user-images.githubusercontent.com/112222186/228956352-1addbdb9-2456-465a-b51d-089f462cd385.png).
## Contributing
diff --git a/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py
similarity index 100%
rename from api-example-chat-stream.py
rename to api-examples/api-example-chat-stream.py
diff --git a/api-example-chat.py b/api-examples/api-example-chat.py
similarity index 100%
rename from api-example-chat.py
rename to api-examples/api-example-chat.py
diff --git a/api-example-stream.py b/api-examples/api-example-stream.py
similarity index 100%
rename from api-example-stream.py
rename to api-examples/api-example-stream.py
diff --git a/api-example.py b/api-examples/api-example.py
similarity index 100%
rename from api-example.py
rename to api-examples/api-example.py
From f276d88546a5a3ec9b3ddb2c71d0b24d46afd23f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 15:41:48 -0300
Subject: [PATCH 5/8] Use AutoGPTQ by default for GPTQ models
---
README.md | 18 +++++++++---------
modules/models.py | 6 +++---
modules/shared.py | 12 +++++-------
modules/ui.py | 2 +-
server.py | 2 +-
5 files changed, 19 insertions(+), 21 deletions(-)
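The behavioral core of this patch is the loader selection in `modules/models.py`: GPTQ-for-LLaMa becomes opt-in via `--gptq-for-llama`, while AutoGPTQ is chosen automatically whenever a `quantize_config.json` exists or `--wbits` is set. A simplified restatement of that order (names mirror the diff below; surrounding details omitted):

```
import argparse
from pathlib import Path

# Simplified sketch of the new selection order; `args` and `model_type`
# stand in for the shared module state used by the real load_model().
def pick_loader(model_name, args, model_type, model_dir='models'):
    quantize_config = Path(f'{model_dir}/{model_name}/quantize_config.json')
    if args.gptq_for_llama:                        # explicit opt-in
        return 'GPTQ_loader'
    elif quantize_config.exists() or args.wbits > 0:
        return 'AutoGPTQ_loader'                   # new default for GPTQ
    elif model_type == 'llamacpp':
        return 'llamacpp_loader'
    return 'huggingface_loader'

ns = argparse.Namespace(gptq_for_llama=False, wbits=4)
print(pick_loader('llama-7b-4bit', ns, 'llama'))  # AutoGPTQ_loader
```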
diff --git a/README.md b/README.md
index cfb90c46..b8ea025f 100644
--- a/README.md
+++ b/README.md
@@ -244,10 +244,18 @@ Optionally, you can use the following command-line flags:
| `--n_ctx N_CTX` | Size of the prompt context. |
| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
-#### GPTQ
+#### AutoGPTQ
+
+| Flag | Description |
+|------------------|-------------|
+| `--triton` | Use triton. |
+| `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
+
+#### GPTQ-for-LLaMa
| Flag | Description |
|---------------------------|-------------|
+| `--gptq-for-llama` | Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ. |
| `--wbits WBITS` | Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
| `--groupsize GROUPSIZE` | Group size. |
@@ -258,14 +266,6 @@ Optionally, you can use the following command-line flags:
| `--warmup_autotune` | (triton) Enable warmup autotune. |
| `--fused_mlp` | (triton) Enable fused mlp. |
-#### AutoGPTQ
-
-| Flag | Description |
-|------------------|-------------|
-| `--autogptq` | Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader. |
-| `--triton` | Use triton. |
-|` --desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
-
#### FlexGen
| Flag | Description |
diff --git a/modules/models.py b/modules/models.py
index 575f28e1..3972133a 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -81,10 +81,10 @@ def load_model(model_name):
logger.error('The path to the model does not exist. Exiting.')
return None, None
- if shared.args.autogptq:
- load_func = AutoGPTQ_loader
- elif shared.args.wbits > 0:
+ if shared.args.gptq_for_llama:
load_func = GPTQ_loader
+ elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or shared.args.wbits > 0:
+ load_func = AutoGPTQ_loader
elif shared.model_type == 'llamacpp':
load_func = llamacpp_loader
elif shared.model_type == 'rwkv':
diff --git a/modules/shared.py b/modules/shared.py
index 9a025587..d57efef4 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -141,7 +141,8 @@ parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Ena
parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')
# AutoGPTQ
-parser.add_argument('--autogptq', action='store_true', help='Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader.')
+parser.add_argument('--gptq-for-llama', action='store_true', help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')
+parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
parser.add_argument('--triton', action='store_true', help='Use triton.')
parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
@@ -181,12 +182,9 @@ parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The m
args = parser.parse_args()
args_defaults = parser.parse_args([])
-# Deprecation warnings for parameters that have been renamed
-deprecated_dict = {}
-for k in deprecated_dict:
- if getattr(args, k) != deprecated_dict[k][1]:
- logger.warning(f"--{k} is deprecated and will be removed. Use --{deprecated_dict[k][0]} instead.")
- setattr(args, deprecated_dict[k][0], getattr(args, k))
+# Deprecation warnings
+if args.autogptq:
+ logger.warning('--autogptq has been deprecated and will be removed soon. AutoGPTQ is now used by default for GPTQ models.')
# Security warnings
if args.trust_remote_code:
diff --git a/modules/ui.py b/modules/ui.py
index 62796032..a10edec2 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -30,7 +30,7 @@ theme = gr.themes.Default(
def list_model_elements():
- elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'autogptq', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
+ elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'gptq_for_llama', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
for i in range(torch.cuda.device_count()):
elements.append(f'gpu_memory_{i}')
diff --git a/server.py b/server.py
index 8bfb45aa..ce7086a5 100644
--- a/server.py
+++ b/server.py
@@ -393,12 +393,12 @@ def create_model_menus():
with gr.Row():
with gr.Column():
gr.Markdown('AutoGPTQ')
- shared.gradio['autogptq'] = gr.Checkbox(label="autogptq", value=shared.args.autogptq, info='Activate AutoGPTQ loader. gpu-memory should be used for CPU offloading instead of pre_layer.')
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
with gr.Column():
gr.Markdown('GPTQ-for-LLaMa')
+ shared.gradio['gptq_for_llama'] = gr.Checkbox(label="gptq-for-llama", value=shared.args.gptq_for_llama, info='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ. pre_layer should be used for CPU offloading instead of gpu-memory.')
with gr.Row():
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
From 99d701994a09289e4db270cd92e3ea18ed3f16ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 15:55:00 -0300
Subject: [PATCH 6/8] Update GPTQ-models-(4-bit-mode).md
---
docs/GPTQ-models-(4-bit-mode).md | 113 +++++++++++++++----------------
1 file changed, 56 insertions(+), 57 deletions(-)
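The rewritten doc notes that `--wbits` and `--groupsize` are often detected from the model folder name. A hedged guess at how such detection can work (purely illustrative of the naming convention; the webui's actual parsing may differ):

```
import re

# Hedged sketch: pull quantization parameters out of names like
# 'llama-13b-4bit-128g'.
def infer_gptq_params(model_name):
    wbits = groupsize = None
    m = re.search(r'(\d+)bit', model_name)
    if m:
        wbits = int(m.group(1))
    m = re.search(r'(\d+)g\b', model_name)
    if m:
        groupsize = int(m.group(1))
    return wbits, groupsize

print(infer_gptq_params('llama-13b-4bit-128g'))  # (4, 128)
```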
diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md
index f7148403..63a6ed5b 100644
--- a/docs/GPTQ-models-(4-bit-mode).md
+++ b/docs/GPTQ-models-(4-bit-mode).md
@@ -6,14 +6,6 @@ GPTQ is a clever quantization algorithm that lightly reoptimizes the weights dur
There are two ways of loading GPTQ models in the web UI at the moment:
-* Using GPTQ-for-LLaMa directly:
- * faster CPU offloading
- * faster multi-GPU inference
- * supports loading LoRAs using a monkey patch
- * included by default in the one-click installers
- * requires you to manually figure out the wbits/groupsize/model_type parameters for the model to be able to load it
- * supports either only cuda or only triton depending on the branch
-
* Using AutoGPTQ:
* supports more models
* standardized (no need to guess any parameter)
@@ -21,8 +13,59 @@ There are two ways of loading GPTQ models in the web UI at the moment:
* ~no wheels are presently available so it requires manual compilation~
* supports loading both triton and cuda models
+* Using GPTQ-for-LLaMa directly:
+ * faster CPU offloading
+ * faster multi-GPU inference
+ * supports loading LoRAs using a monkey patch
+ * requires you to manually figure out the wbits/groupsize/model_type parameters for the model to be able to load it
+ * supports either only cuda or only triton depending on the branch
+
For creating new quantizations, I recommend using AutoGPTQ: https://github.com/PanQiWei/AutoGPTQ
+## AutoGPTQ
+
+### Installation
+
+No additional steps are necessary as AutoGPTQ is already in the `requirements.txt` for the webui. If you still want or need to install it manually for whatever reason, these are the commands:
+
+```
+conda activate textgen
+git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
+pip install .
+```
+
+The last command requires `nvcc` to be installed (see the [instructions above](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#step-1-install-nvcc)).
+
+### Usage
+
+When you quantize a model using AutoGPTQ, a folder containing a file called `quantize_config.json` will be generated. Place that folder inside your `models/` folder and load it as you would any other model; AutoGPTQ is now the default loader for GPTQ models:
+
+```
+python server.py --model model_name
+```
+
+No flag or checkbox is needed: the web UI selects the AutoGPTQ loader automatically whenever a `quantize_config.json` is present (or `--wbits` is set).
+
+### Offloading
+
+In order to do CPU offloading or multi-GPU inference with AutoGPTQ, use the `--gpu-memory` flag. It is currently somewhat slower than offloading with the `--pre_layer` option in GPTQ-for-LLaMa.
+
+For CPU offloading:
+
+```
+python server.py --gpu-memory 3000MiB --model model_name
+```
+
+For multi-GPU inference:
+
+```
+python server.py --gpu-memory 3000MiB 6000MiB --model model_name
+```
+
+### Using LoRAs with AutoGPTQ
+
+Not supported yet.
+
## GPTQ-for-LLaMa
GPTQ-for-LLaMa is the original adaptation of GPTQ for the LLaMA model. It was made possible by [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-for-LLaMa): https://github.com/qwopqwop200/GPTQ-for-LLaMa
@@ -108,23 +151,21 @@ These are models that you can simply download and place in your `models` folder.
### Starting the web UI:
+Use the `--gptq-for-llama` flag.
+
For the models converted without `group-size`:
```
-python server.py --model llama-7b-4bit
+python server.py --model llama-7b-4bit --gptq-for-llama
```
For the models converted with `group-size`:
```
-python server.py --model llama-13b-4bit-128g
+python server.py --model llama-13b-4bit-128g --gptq-for-llama --wbits 4 --groupsize 128
```
-The command-line flags `--wbits` and `--groupsize` are automatically detected based on the folder names, but you can also specify them manually like
-
-```
-python server.py --model llama-13b-4bit-128g --wbits 4 --groupsize 128
-```
+In many cases, the values for `--wbits` and `--groupsize` are detected automatically from the model folder name.
### CPU offloading
@@ -171,46 +212,4 @@ pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch
```
-## AutoGPTQ
-### Installation
-
-No additional steps are necessary as AutoGPTQ is already in the `requirements.txt` for the webui. If you still want or need to install it manually for whatever reason, these are the commands:
-
-```
-conda activate textgen
-git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
-pip install .
-```
-
-The last command requires `nvcc` to be installed (see the [instructions above](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#step-1-install-nvcc)).
-
-### Usage
-
-When you quantize a model using AutoGPTQ, a folder containing a filed called `quantize_config.json` will be generated. Place that folder inside your `models/` folder and load it with the `--autogptq` flag:
-
-```
-python server.py --autogptq --model model_name
-```
-
-Alternatively, check the `autogptq` box in the "Model" tab of the UI before loading the model.
-
-### Offloading
-
-In order to do CPU offloading or multi-gpu inference with AutoGPTQ, use the `--gpu-memory` flag. It is currently somewhat slower than offloading with the `--pre_layer` option in GPTQ-for-LLaMA.
-
-For CPU offloading:
-
-```
-python server.py --autogptq --gpu-memory 3000MiB --model model_name
-```
-
-For multi-GPU inference:
-
-```
-python server.py --autogptq --gpu-memory 3000MiB 6000MiB --model model_name
-```
-
-### Using LoRAs with AutoGPTQ
-
-Not supported yet.
From bef94b9ebb3d1af88eb6ad4403e2546b7d5464ce Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 17:01:13 -0300
Subject: [PATCH 7/8] Update README
---
README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index b8ea025f..f1543598 100644
--- a/README.md
+++ b/README.md
@@ -151,8 +151,9 @@ You can drop these directly into the `models/` folder, making sure that the file
<details>
<summary>
-Download instructions
+Instructions
+
</summary>
[GPT-4chan](https://huggingface.co/ykilcher/gpt-4chan) has been shut down from Hugging Face, so you need to download it elsewhere. You have two options:
* Torrent: [16-bit](https://archive.org/details/gpt4chan_model_float16) / [32-bit](https://archive.org/details/gpt4chan_model)
From eda224c92ddaec70654c8c82dbd9270cba2c440b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 17:04:09 -0300
Subject: [PATCH 8/8] Update README
---
README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index f1543598..6f50f62c 100644
--- a/README.md
+++ b/README.md
@@ -11,15 +11,15 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features
* 3 interface modes: default, notebook, and chat
+* Multiple model backends: transformers, llama.cpp, AutoGPTQ, GPTQ-for-LLaMa, RWKV, FlexGen
* Dropdown menu for quickly switching between different models
* LoRA: load and unload LoRAs on the fly, load multiple LoRAs at the same time, train a new LoRA
-* Precise instruction templates in chat mode, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy
+* Precise instruction templates for chat mode, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy
* [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal)
-* Multiple model backends: transformers, llama.cpp, AutoGPTQ, GPTQ-for-LLaMa, RWKV, FlexGen
* 8-bit and 4-bit inference through bitsandbytes
* CPU mode for transformers models
* [DeepSpeed ZeRO-3 inference](docs/DeepSpeed.md)
-* [Extension support](docs/Extensions.md)
+* [Extensions](docs/Extensions.md)
* [Custom chat characters](docs/Chat-mode.md)
* Very efficient text streaming
* Markdown output with LaTeX rendering, to use for instance with [GALACTICA](https://github.com/paperswithcode/galai)