From 913e060348a4ce71f15daba8911ad5836c99bdb6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 19 Jul 2023 08:24:37 -0700
Subject: [PATCH 1/3] Change the default preset to Divine Intellect

It seems to reduce hallucinations when using instruction-tuned models.
---
 modules/shared.py      | 2 +-
 settings-template.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 08d88ff5..d805d30f 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -61,7 +61,7 @@ settings = {
     'chat_generation_attempts_max': 10,
     'default_extensions': [],
     'chat_default_extensions': ['gallery'],
-    'preset': 'simple-1',
+    'preset': 'Divine Intellect',
     'prompt': 'QA',
 }
 
diff --git a/settings-template.yaml b/settings-template.yaml
index ef9a7e7e..de2c73d3 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -36,5 +36,5 @@ chat_generation_attempts_max: 10
 default_extensions: []
 chat_default_extensions:
 - gallery
-preset: simple-1
+preset: 'Divine Intellect'
 prompt: QA

From 4b19b74e6c8d9c99634e16774d3ebcb618ba7a18 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 19 Jul 2023 19:31:19 -0700
Subject: [PATCH 2/3] Add CUDA wheels for llama-cpp-python by jllllll

---
 modules/llamacpp_hf.py    |  5 ++++-
 modules/llamacpp_model.py |  7 ++++++-
 requirements.txt          | 14 +++++++++-----
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 94d893c4..e09c1a74 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
 import torch
-from llama_cpp import Llama
 from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
@@ -11,6 +10,10 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from modules import shared
 from modules.logging_colors import logger
 
+if torch.cuda.is_available():
+    from llama_cpp_cuda import Llama
+else:
+    from llama_cpp import Llama
 
 class LlamacppHF(PreTrainedModel):
     def __init__(self, model):
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 180b0f37..c6e6ec54 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -9,12 +9,17 @@ https://abetlen.github.io/llama-cpp-python/
 import re
 from functools import partial
 
-from llama_cpp import Llama, LlamaCache, LogitsProcessorList
+import torch
 
 from modules import shared
 from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 
+if torch.cuda.is_available():
+    from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList
+else:
+    from llama_cpp import Llama, LlamaCache, LogitsProcessorList
+
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
     logits[eos_token] = -float('inf')
diff --git a/requirements.txt b/requirements.txt
index 6382ea43..a2f0811d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,18 +13,22 @@ Pillow>=9.5.0
 pyyaml
 requests
 safetensors==0.3.1
-sentencepiece
-tqdm
 scipy
+sentencepiece
 tensorboard
-wandb
 transformers==4.31.*
+tqdm
+wandb
 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524
 bitsandbytes==0.40.2; platform_system != "Windows"
 https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.2-py3-none-win_amd64.whl; platform_system == "Windows"
-llama-cpp-python==0.1.73; platform_system != "Windows"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.73/llama_cpp_python-0.1.73-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/exllama/releases/download/0.0.6/exllama-0.0.6+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/exllama/releases/download/0.0.6/exllama-0.0.6+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+# llama-cpp-python without GPU support
+llama-cpp-python==0.1.73; platform_system != "Windows"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.73/llama_cpp_python-0.1.73-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+# llama-cpp-python with CUDA support
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.73+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.73+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 6415cc68a27e32d5b08f92f34d98b91864804f2f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 19 Jul 2023 21:20:40 -0700
Subject: [PATCH 3/3] Remove obsolete information from README

---
 README.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/README.md b/README.md
index 6cba1b31..225ed6c4 100644
--- a/README.md
+++ b/README.md
@@ -90,10 +90,6 @@ cd text-generation-webui
 pip install -r requirements.txt
 ```
 
-#### llama.cpp with GPU acceleration
-
-Requires the additional compilation step described here: [GPU acceleration](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md#gpu-acceleration).
-
 #### bitsandbytes
 
 bitsandbytes >= 0.39 may not work on older NVIDIA GPUs. In that case, to use `--load-in-8bit`, you may have to downgrade like this: