From 8a98646a2182da3e661595d3f64da084092c5285 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Fri, 6 Oct 2023 00:12:22 +0200
Subject: [PATCH] Bump ExLlamaV2 to 0.0.5 (#4186)

---
 modules/exllamav2.py             | 2 +-
 modules/loaders.py               | 2 ++
 requirements.txt                 | 6 +++---
 requirements_amd.txt             | 2 +-
 requirements_amd_noavx2.txt      | 2 +-
 requirements_apple_intel.txt     | 2 +-
 requirements_apple_silicon.txt   | 2 +-
 requirements_cpu_only.txt        | 2 +-
 requirements_cpu_only_noavx2.txt | 2 +-
 requirements_noavx2.txt          | 6 +++---
 requirements_nowheels.txt        | 2 +-
 11 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index eb8e160c..278d3943 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -110,7 +110,7 @@ class Exllamav2Model:
         has_leading_space = False
         for i in range(max_new_tokens):
             logits = self.model.forward(ids[:, -1:], self.cache, input_mask=None).float().cpu()
-            token, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random())
+            token, _, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random(), self.tokenizer)
             ids = torch.cat([ids, token], dim=1)
 
             if i == 0 and self.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
diff --git a/modules/loaders.py b/modules/loaders.py
index a9b30bb6..ab10e0a4 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -216,6 +216,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'add_bos_token',
         'custom_token_bans',
         'auto_max_new_tokens',
     },
@@ -228,6 +229,7 @@ loaders_samplers = {
         'repetition_penalty_range',
         'seed',
         'ban_eos_token',
+        'add_bos_token',
         'custom_token_bans',
         'auto_max_new_tokens',
     },
diff --git a/requirements.txt b/requirements.txt
index 19881eff..1db6ea47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4; platform_system != "Darwin" and platform_machine != "x86_64"
+exllamav2==0.0.5; platform_system != "Darwin" and platform_machine != "x86_64"
 markdown
 numpy==1.24
 optimum==1.13.1
@@ -40,8 +40,8 @@ https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/exllama/releases/download/0.0.17/exllama-0.0.17+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/exllama/releases/download/0.0.17/exllama-0.0.17+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.4/exllamav2-0.0.4+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.4/exllamav2-0.0.4+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.5/exllamav2-0.0.5+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.5/exllamav2-0.0.5+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu117torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu117-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 9723b583..0a015a95 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4
+exllamav2==0.0.5
 markdown
 numpy==1.24
 optimum==1.13.1
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 112d411d..fb1acd0a 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4
+exllamav2==0.0.5
 markdown
 numpy==1.24
 optimum==1.13.1
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 4d1bbc00..a84095c2 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4
+exllamav2==0.0.5
 markdown
 numpy==1.24
 optimum==1.13.1
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 6187e72f..ba39cdbb 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4
+exllamav2==0.0.5
 markdown
 numpy==1.24
 optimum==1.13.1
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index cc040ebb..d29bc615 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4
+exllamav2==0.0.5
 markdown
 numpy==1.24
 optimum==1.13.1
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index eecb24c9..ee4f7c56 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4
+exllamav2==0.0.5
 markdown
 numpy==1.24
 optimum==1.13.1
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index bc2e2451..f74d05f3 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4; platform_system != "Darwin" and platform_machine != "x86_64"
+exllamav2==0.0.5; platform_system != "Darwin" and platform_machine != "x86_64"
 markdown
 numpy==1.24
 optimum==1.13.1
@@ -40,8 +40,8 @@ https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/exllama/releases/download/0.0.17/exllama-0.0.17+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/exllama/releases/download/0.0.17/exllama-0.0.17+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.4/exllamav2-0.0.4+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.4/exllamav2-0.0.4+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.5/exllamav2-0.0.5+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.5/exllamav2-0.0.5+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu117torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu117-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index 4de87c6c..9101b9aa 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -8,7 +8,7 @@ accelerate==0.23.*
 colorama
 datasets
 einops
-exllamav2==0.0.4
+exllamav2==0.0.5
 markdown
 numpy==1.24
 optimum==1.13.1
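Note: the sketch below is not part of the patch. It is a minimal illustration of the call-site change the modules/exllamav2.py hunk tracks, assuming a model, cache, tokenizer, and sampler settings have already been constructed the way that module does. In ExLlamaV2 0.0.5, ExLlamaV2Sampler.sample() takes the tokenizer as an extra argument and returns three values instead of two; generate_tokens is a hypothetical helper, not webui code.

    import random

    import torch
    from exllamav2.generator import ExLlamaV2Sampler

    def generate_tokens(model, cache, tokenizer, settings, ids, max_new_tokens):
        # Autoregressive loop: feed the last token back in, sample the next one.
        for _ in range(max_new_tokens):
            logits = model.forward(ids[:, -1:], cache, input_mask=None).float().cpu()
            # 0.0.4: token, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random())
            # 0.0.5: pass the tokenizer and unpack a third return value.
            token, _, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random(), tokenizer)
            ids = torch.cat([ids, token], dim=1)
        return ids

Downstream code that pinned the old two-value unpacking will raise a ValueError ("too many values to unpack") against 0.0.5, which is why the requirements bump and the call-site change ship in the same patch.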