From c333e4c906800c0216cd11f9118d6237b071b376 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 9 Jun 2023 00:45:49 -0300
Subject: [PATCH] Add docs for performance optimizations

---
 docs/Performance-optimizations.md | 48 +++++++++++++++++++++++++++++++
 docs/README.md                    |  1 +
 requirements-minimal.txt          | 22 ++++++++++++++
 3 files changed, 71 insertions(+)
 create mode 100644 docs/Performance-optimizations.md
 create mode 100644 requirements-minimal.txt

diff --git a/docs/Performance-optimizations.md b/docs/Performance-optimizations.md
new file mode 100644
index 00000000..c5fae54e
--- /dev/null
+++ b/docs/Performance-optimizations.md
@@ -0,0 +1,48 @@
+# Performance optimizations
+
+To get the highest possible performance on your hardware, you can try compiling the following three backends manually instead of relying on the pre-compiled binaries that are part of `requirements.txt`:
+
+* AutoGPTQ (the default GPTQ loader)
+* GPTQ-for-LLaMa (the secondary GPTQ loader)
+* llama-cpp-python
+
+If you go this route, you should update the webui's Python requirements in the future with
+
+```
+pip install -r requirements-minimal.txt --upgrade
+```
+
+and then install the up-to-date backends using the commands below. The file `requirements-minimal.txt` contains all the requirements except for the pre-compiled wheels for GPTQ and llama-cpp-python.
+
+## AutoGPTQ
+
+```
+conda activate textgen
+pip uninstall auto-gptq -y
+git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
+pip install .
+```
+
+## GPTQ-for-LLaMa
+
+```
+conda activate textgen
+pip uninstall quant-cuda -y
+cd text-generation-webui/repositories
+rm -r GPTQ-for-LLaMa
+git clone https://github.com/oobabooga/GPTQ-for-LLaMa
+cd GPTQ-for-LLaMa
+python setup_cuda.py install
+```
+
+## llama-cpp-python
+
+If you do not have a GPU:
+
+```
+conda activate textgen
+pip uninstall -y llama-cpp-python
+pip install llama-cpp-python
+```
+
+If you have a GPU, use the commands here instead: [llama.cpp-models.md#gpu-acceleration](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md#gpu-acceleration)
diff --git a/docs/README.md b/docs/README.md
index 37c4fe37..72e816de 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -19,3 +19,4 @@
 * [WSL installation guide](WSL-installation-guide.md)
 * [Docker Compose](Docker.md)
 * [Audio notification](Audio-Notification.md)
+* [Performance optimizations](Performance-optimizations.md)
diff --git a/requirements-minimal.txt b/requirements-minimal.txt
new file mode 100644
index 00000000..148c96f0
--- /dev/null
+++ b/requirements-minimal.txt
@@ -0,0 +1,22 @@
+accelerate==0.20.3
+colorama
+datasets
+einops
+flexgen==0.1.7
+gradio_client==0.2.5
+gradio==3.33.1
+markdown
+numpy
+pandas
+Pillow>=9.5.0
+pyyaml
+requests
+safetensors==0.3.1
+sentencepiece
+tqdm
+scipy
+transformers==4.30.0
+git+https://github.com/huggingface/peft@e45529b149c7f91ec1d4d82a5a152ef56c56cb94
+bitsandbytes==0.39.0; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows"
+llama-cpp-python==0.1.57; platform_system != "Windows"
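
A quick sanity check after compiling the backends above (a sketch, not part of the patch) is to confirm that each one imports from the freshly built package rather than a leftover pre-compiled wheel. The import names `auto_gptq`, `quant_cuda`, and `llama_cpp` are assumptions based on what these projects install:

```python
# Confirm each rebuilt backend is importable and show where it was loaded
# from, so a stale pre-compiled wheel is easy to spot.
import importlib

for name in ("auto_gptq", "quant_cuda", "llama_cpp"):
    try:
        mod = importlib.import_module(name)
        # C extensions such as quant_cuda may not define __version__
        version = getattr(mod, "__version__", "unknown")
        print(f"{name}: version {version}, loaded from {getattr(mod, '__file__', '?')}")
    except ImportError as err:
        print(f"{name}: not available ({err})")
```

If a reported path still points at a wheel you meant to replace, re-run the corresponding uninstall/install steps from the document above.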