From 498fec2c7c7df376007f264261acfcfcb76168ea Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 25 Jul 2024 15:09:31 -0700 Subject: [PATCH 01/41] UI: fix saving characters --- modules/ui_file_saving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index 51fac69f..ac72c623 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -107,7 +107,7 @@ def handle_save_character_confirm_click(name2, greeting, context, character_pict try: chat.save_character(name2, greeting, context, character_picture, filename) available_characters = utils.get_available_characters() - output = gr.update(choices=available_characters, value=filename), + output = gr.update(choices=available_characters, value=filename) except Exception: output = gr.update() traceback.print_exc() From ac30b004efd7b9e2511d0baf619e8025c745ba09 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:38:39 -0700 Subject: [PATCH 02/41] Pin fastapi/pydantic requirement versions --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 08b7d56d..51558a84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ bitsandbytes==0.43.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -17,6 +18,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -69,4 +71,4 @@ https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310 https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" From f98431c7448381bfa4e859ace70e0379f6431018 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:47:25 -0700 Subject: [PATCH 03/41] Apply the change to all requirements (oops) --- requirements_amd.txt | 4 +++- requirements_amd_noavx2.txt | 4 +++- requirements_apple_intel.txt | 2 ++ requirements_apple_silicon.txt | 2 ++ requirements_cpu_only.txt | 2 ++ requirements_cpu_only_noavx2.txt | 2 ++ requirements_noavx2.txt | 4 +++- requirements_nowheels.txt | 2 ++ 8 files changed, 19 insertions(+), 3 deletions(-) diff --git a/requirements_amd.txt b/requirements_amd.txt index 52e36510..e9dfb128 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -2,6 +2,7 @@ accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -14,6 +15,7 @@ pandas peft==0.12.* 
Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -46,4 +48,4 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8- https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 18a81d04..69a350ad 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -2,6 +2,7 @@ accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -14,6 +15,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -44,4 +46,4 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8- https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index af02904b..0326916e 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -2,6 +2,7 @@ accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -14,6 +15,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 8cdd8519..70ee3d27 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -2,6 +2,7 @@ accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -14,6 +15,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil 
+pydantic==2.8.2 pyyaml requests rich diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 807c182a..fe2d796c 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -2,6 +2,7 @@ accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -14,6 +15,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index e2a89936..982a12a1 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -2,6 +2,7 @@ accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -14,6 +15,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index d22eb72c..fc253f5c 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -5,6 +5,7 @@ bitsandbytes==0.43.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -17,6 +18,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -69,4 +71,4 @@ https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310 https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index ffb45fe3..ec18464d 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -2,6 +2,7 @@ accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* hqq==0.1.7.post3 jinja2==3.1.4 @@ -14,6 +15,7 @@ pandas peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich From c497a3237229db0de4a4704e829d75a45973c91f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 26 Sep 2024 11:55:51 -0700 Subject: [PATCH 04/41] Bump transformers to 4.45 --- requirements.txt | 2 +- requirements_amd.txt | 2 +- requirements_amd_noavx2.txt | 2 +- requirements_apple_intel.txt | 2 +- requirements_apple_silicon.txt | 2 +- requirements_cpu_only.txt | 2 +- requirements_cpu_only_noavx2.txt | 2 +- requirements_noavx2.txt | 2 +- requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7164ac80..913c2463 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,7 +26,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git 
a/requirements_amd.txt b/requirements_amd.txt index 20ba9c79..b9d74b69 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -23,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index e5c18681..ccacab29 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index e60937c7..dc1757e2 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -23,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index e1b726bb..a913a63c 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -23,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index b0480c43..b0dbc91a 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -23,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index a64d68a7..828d0c97 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index cae98bb2..458c1410 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -26,7 +26,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index ec18464d..e7cbc29c 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -23,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.44.* +transformers==4.45.* tqdm wandb From 7424f789bf699c9b2efc874da2b84168bb9e7a6f Mon Sep 17 00:00:00 2001 From: oobabooga Date: Fri, 27 Sep 2024 19:03:25 -0300 Subject: [PATCH 05/41] Fix the sampling monkey patch (and add more options to sampler_priority) (#6411) --- modules/presets.py | 2 +- modules/sampler_hijack.py | 192 +++++++++++++++++++++++--------------- 2 files changed, 116 insertions(+), 78 deletions(-) diff --git a/modules/presets.py b/modules/presets.py index b00e829e..14c5a777 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -44,7 +44,7 @@ def default_preset(): 'dry_base': 1.75, 'dry_allowed_length': 2, 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', - 'sampler_priority': 'temperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat' + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nencoder_repetition_penalty\nno_repeat_ngram' } diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 9fb661ae..9e865bb3 100644 --- a/modules/sampler_hijack.py +++ 
b/modules/sampler_hijack.py @@ -323,62 +323,141 @@ class SpyLogitsWarper(LogitsWarper): class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor): - ''' - Copied from the transformers library - ''' - - def __init__(self, penalty: float, presence_penalty: float, frequency_penalty: float, _range: int): + def __init__(self, penalty: float, _range: int): if not (penalty > 0): raise ValueError(f"`penalty` has to be strictly positive, but is {penalty}") - self.penalty = penalty - self.presence_penalty = presence_penalty - self.frequency_penalty = frequency_penalty self._range = _range + def apply_repetition_penalty(self, input_ids_row, scores_row): + unique_ids = torch.unique(input_ids_row) + score = torch.gather(scores_row, 0, unique_ids) + + # Apply multiplicative repetition penalty + score = torch.where(score < 0, score * self.penalty, score / self.penalty) + scores_row.scatter_(0, unique_ids, score) + return scores_row + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: input_ids = input_ids[:, -self._range:] - - # We loop here because torch.unique() needs to process each row separately in the - # case that batch_size > 1. for input_ids_row, scores_row in zip(input_ids, scores): - unique_ids, counts = torch.unique(input_ids_row, return_counts=True) - score = torch.gather(scores_row, 0, unique_ids) - - # multiplicative repetition penalty - # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability - score = torch.where(score < 0, score * self.penalty, score / self.penalty) - scores_row.scatter_(0, unique_ids, score) - - # presence_penalty and frequency_penalty - raw_presence_penalty = (counts > 0).to(scores.dtype) - raw_frequency_penalty = counts.to(scores.dtype) - additive_penalty = raw_presence_penalty * self.presence_penalty + raw_frequency_penalty * self.frequency_penalty - scores_row.scatter_add_(0, unique_ids, -additive_penalty) + scores_row = self.apply_repetition_penalty(input_ids_row, scores_row) return scores -def get_logits_warper_patch(self, generation_config, **kwargs): +class PresencePenaltyLogitsProcessor(LogitsProcessor): + def __init__(self, presence_penalty: float, _range: int): + self.presence_penalty = presence_penalty + self._range = _range + + def apply_presence_penalty(self, input_ids_row, scores_row): + unique_ids, counts = torch.unique(input_ids_row, return_counts=True) + + # Apply presence penalty + raw_presence_penalty = (counts > 0).to(scores_row.dtype) + presence_penalty = raw_presence_penalty * self.presence_penalty + scores_row.scatter_add_(0, unique_ids, -presence_penalty) + return scores_row + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + input_ids = input_ids[:, -self._range:] + for input_ids_row, scores_row in zip(input_ids, scores): + scores_row = self.apply_presence_penalty(input_ids_row, scores_row) + return scores + + +class FrequencyPenaltyLogitsProcessor(LogitsProcessor): + def __init__(self, frequency_penalty: float, _range: int): + self.frequency_penalty = frequency_penalty + self._range = _range + + def apply_frequency_penalty(self, input_ids_row, scores_row): + unique_ids, counts = torch.unique(input_ids_row, return_counts=True) + + # Apply frequency penalty + raw_frequency_penalty = counts.to(scores_row.dtype) + frequency_penalty = raw_frequency_penalty * self.frequency_penalty + scores_row.scatter_add_(0, unique_ids, -frequency_penalty) + return scores_row + + def __call__(self, input_ids: 
torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + input_ids = input_ids[:, -self._range:] + for input_ids_row, scores_row in zip(input_ids, scores): + scores_row = self.apply_frequency_penalty(input_ids_row, scores_row) + return scores + + +def get_logits_processor_patch(self, **kwargs): + generation_config = kwargs['generation_config'] # Parameter sanitization if isinstance(generation_config.temperature, int): generation_config.temperature = float(generation_config.temperature) # Must be float # Get the original warpers - warpers = self._get_logits_warper_old(generation_config, **kwargs) + warpers = self._get_logits_processor_old(**kwargs) - # Replace temperature with our modified class. - # Currently, it behaves identically to the original. - for i in range(len(warpers)): + for i in range(len(warpers) - 1, -1, -1): + # Replace temperature with our modified class. if warpers[i].__class__.__name__ == 'TemperatureLogitsWarper': warpers[i] = TemperatureLogitsWarperCustom( generation_config.temperature, ) + # Stuff we don't need + elif warpers[i].__class__.__name__ in ['SuppressTokensLogitsProcessor', 'RepetitionPenaltyLogitsProcessor']: + del warpers[i] + # Add custom warpers warpers_to_add = LogitsProcessorList() min_tokens_to_keep = 2 if generation_config.num_beams > 1 else 1 + + if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0: + warpers_to_add.append( + RepetitionPenaltyLogitsProcessorWithRange( + penalty=generation_config.repetition_penalty, + _range=generation_config.repetition_penalty_range + ) + ) + + if generation_config.presence_penalty is not None and generation_config.presence_penalty != 0.0: + warpers_to_add.append( + PresencePenaltyLogitsProcessor( + presence_penalty=generation_config.presence_penalty, + _range=generation_config.repetition_penalty_range + ) + ) + + if generation_config.frequency_penalty is not None and generation_config.frequency_penalty != 0.0: + warpers_to_add.append( + FrequencyPenaltyLogitsProcessor( + frequency_penalty=generation_config.frequency_penalty, + _range=generation_config.repetition_penalty_range + ) + ) + + if generation_config.dry_multiplier is not None and generation_config.dry_multiplier > 0.0: + dry_sequence_breakers = generation_config.dry_sequence_breakers + + # Support both JSON array notation and comma-separated strings. + if not dry_sequence_breakers.startswith("["): + dry_sequence_breakers = "[" + dry_sequence_breakers + "]" + + sequence_breaker_strings = json.loads(dry_sequence_breakers) + # Prefix with 'a' to get the correct encoding of the token at the end of a text. 
+ sequence_breakers = {shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings} + + warpers.append( + DRYLogitsProcessor( + multiplier=generation_config.dry_multiplier, + base=generation_config.dry_base, + allowed_length=generation_config.dry_allowed_length, + sequence_breakers=sequence_breakers, + _range=generation_config.repetition_penalty_range, + ) + ) + if generation_config.tfs is not None and 0.0 <= generation_config.tfs < 1.0: warpers_to_add.append( TailFreeLogitsWarper( @@ -454,7 +533,12 @@ def get_logits_warper_patch(self, generation_config, **kwargs): 'TopALogitsWarper': 'top_a', 'TopKLogitsWarper': 'top_k', 'TopPLogitsWarper': 'top_p', - 'TypicalLogitsWarper': 'typical_p' + 'TypicalLogitsWarper': 'typical_p', + 'RepetitionPenaltyLogitsProcessorWithRange': 'repetition_penalty', + 'PresencePenaltyLogitsProcessor': 'presence_penalty', + 'FrequencyPenaltyLogitsProcessor': 'frequency_penalty', + 'EncoderRepetitionPenaltyLogitsProcessor': 'encoder_repetition_penalty', + 'NoRepeatNGramLogitsProcessor': 'no_repeat_ngram', } def custom_sort_key(obj): @@ -482,49 +566,6 @@ def get_logits_warper_patch(self, generation_config, **kwargs): return warpers -def get_logits_processor_patch(self, **kwargs): - generation_config = kwargs['generation_config'] - - do_rep_pen_hijack = (generation_config.repetition_penalty > 1) or (generation_config.presence_penalty != 0) or (generation_config.frequency_penalty != 0) - if do_rep_pen_hijack: - generation_config.repetition_penalty = 1.1 # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created - - result = self._get_logits_processor_old(**kwargs) - - if do_rep_pen_hijack: - for i in range(len(result)): - if result[i].__class__.__name__ == 'RepetitionPenaltyLogitsProcessor': - result[i] = RepetitionPenaltyLogitsProcessorWithRange( - generation_config.repetition_penalty, - generation_config.presence_penalty, - generation_config.frequency_penalty, - generation_config.repetition_penalty_range - ) - - if generation_config.dry_multiplier is not None and generation_config.dry_multiplier > 0.0: - dry_sequence_breakers = generation_config.dry_sequence_breakers - - # Support both JSON array notation and comma-separated strings. - if not dry_sequence_breakers.startswith("["): - dry_sequence_breakers = "[" + dry_sequence_breakers + "]" - - sequence_breaker_strings = json.loads(dry_sequence_breakers) - # Prefix with 'a' to get the correct encoding of the token at the end of a text. 
- sequence_breakers = {shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings} - - result.append( - DRYLogitsProcessor( - multiplier=generation_config.dry_multiplier, - base=generation_config.dry_base, - allowed_length=generation_config.dry_allowed_length, - sequence_breakers=sequence_breakers, - _range=generation_config.repetition_penalty_range, - ) - ) - - return result - - def generation_config_init_patch(self, **kwargs): self.__init___old(**kwargs) self.min_p = kwargs.pop("min_p", 0.0) @@ -547,13 +588,10 @@ def generation_config_init_patch(self, **kwargs): self.dry_allowed_length = kwargs.pop("dry_allowed_length", 2) self.dry_sequence_breakers = kwargs.pop("dry_sequence_breakers", '"\\n", ":", "\\"", "*"') self.temperature_last = kwargs.pop("temperature_last", False) - self.sampler_priority = kwargs.pop("sampler_priority", ['temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat']) + self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'encoder_repetition_penalty', 'no_repeat_ngram']) def hijack_samplers(): - transformers.GenerationMixin._get_logits_warper_old = transformers.GenerationMixin._get_logits_warper - transformers.GenerationMixin._get_logits_warper = get_logits_warper_patch - transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch From c5f048e912ffbb1c7fd4b72609a67bec8ad07b5c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 27 Sep 2024 15:04:08 -0700 Subject: [PATCH 06/41] Bump ExLlamaV2 to 0.2.2 --- requirements.txt | 10 +++++----- requirements_amd.txt | 6 +++--- requirements_amd_noavx2.txt | 6 +++--- requirements_apple_intel.txt | 2 +- requirements_apple_silicon.txt | 2 +- requirements_noavx2.txt | 10 +++++----- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 913c2463..f0360d53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,11 +55,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index b9d74b69..d6bf29e6 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -42,9 +42,9 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp # AMD wheels https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.90+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.90+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index ccacab29..5ce468c3 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -40,9 +40,9 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index dc1757e2..12786ade 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -38,4 +38,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < 
"22.0.0" and python_version == "3.10" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index a913a63c..901c9f10 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -40,4 +40,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 458c1410..87e1bd7f 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -55,11 +55,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.0/exllamav2-0.2.0-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" 
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 78b87054004a18f788d4a52d72a8ca9358aca0e9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 27 Sep 2024 15:06:31 -0700 Subject: [PATCH 07/41] Bump llama-cpp-python to 0.3.0 (except for AMD) --- requirements.txt | 24 ++++++++++++------------ requirements_amd.txt | 8 ++++---- requirements_amd_noavx2.txt | 8 ++++---- requirements_apple_intel.txt | 8 ++++---- requirements_apple_silicon.txt | 12 ++++++------ requirements_cpu_only.txt | 8 ++++---- requirements_cpu_only_noavx2.txt | 8 ++++---- requirements_noavx2.txt | 24 ++++++++++++------------ 8 files changed, 50 insertions(+), 50 deletions(-) diff --git a/requirements.txt b/requirements.txt index f0360d53..1041025c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,22 +37,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == 
"3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index d6bf29e6..b224ce14 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -34,10 +34,10 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.90+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 5ce468c3..3cdb8a7a 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ 
-34,10 +34,10 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 12786ade..fd91464d 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and 
platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 901c9f10..8a50c8b4 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -34,10 +34,10 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.90-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < 
"23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index b0dbc91a..4e89b6ad 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 828d0c97..2c00ccf5 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 87e1bd7f..69d880ac 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -37,22 +37,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.90+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.90+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.90+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine 
== "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" From 5c918c5b2d870f41a5215cfd5c45c652ad6d9b3a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 27 Sep 2024 15:40:48 -0700 Subject: [PATCH 08/41] Make it possible to sort DRY --- modules/presets.py | 2 +- modules/sampler_hijack.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/presets.py b/modules/presets.py index 14c5a777..06e2be70 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -44,7 +44,7 @@ def default_preset(): 'dry_base': 1.75, 'dry_allowed_length': 2, 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', - 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nencoder_repetition_penalty\nno_repeat_ngram' + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nencoder_repetition_penalty\nno_repeat_ngram' } diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 9e865bb3..65503c12 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -537,6 +537,7 @@ def get_logits_processor_patch(self, **kwargs): 'RepetitionPenaltyLogitsProcessorWithRange': 'repetition_penalty', 'PresencePenaltyLogitsProcessor': 'presence_penalty', 'FrequencyPenaltyLogitsProcessor': 'frequency_penalty', + 'DRYLogitsProcessor': 'dry', 'EncoderRepetitionPenaltyLogitsProcessor': 'encoder_repetition_penalty', 'NoRepeatNGramLogitsProcessor': 'no_repeat_ngram', } @@ -588,7 +589,7 @@ def generation_config_init_patch(self, **kwargs): self.dry_allowed_length = kwargs.pop("dry_allowed_length", 2) self.dry_sequence_breakers = kwargs.pop("dry_sequence_breakers", '"\\n", ":", "\\"", "*"') self.temperature_last = kwargs.pop("temperature_last", False) - self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'encoder_repetition_penalty', 'no_repeat_ngram']) + self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'encoder_repetition_penalty', 'no_repeat_ngram']) def hijack_samplers(): From 626b0a043702c9586478d9969184718d24ebf935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thireus=20=E2=98=A0?= Date: Fri, 27 Sep 2024 23:47:04 +0100 Subject: [PATCH 09/41] Force /bin/bash shell for conda (#6386) --- one_click.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/one_click.py b/one_click.py index 0a0412ba..d35ecc89 100644 --- a/one_click.py +++ 
b/one_click.py @@ -190,7 +190,7 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, cmd = f'. "{conda_sh_path}" && conda activate "{conda_env_path}" && {cmd}' # Run shell commands - result = subprocess.run(cmd, shell=True, capture_output=capture_output, env=env) + result = subprocess.run(cmd, shell=True, executable='/bin/bash', capture_output=capture_output, env=env) # Assert the command ran successfully if assert_success and result.returncode != 0: From 3492e33fd50bcc78a81850fdcc995e097a79b8ef Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 27 Sep 2024 16:59:30 -0700 Subject: [PATCH 10/41] Bump bitsandbytes to 0.44 --- requirements.txt | 2 +- requirements_noavx2.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1041025c..54bd414a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ accelerate==0.33.* aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" auto-gptq==0.7.1 -bitsandbytes==0.43.* +bitsandbytes==0.44.* colorama datasets einops diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 69d880ac..39bfb0bd 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,7 +1,7 @@ accelerate==0.33.* aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" auto-gptq==0.7.1 -bitsandbytes==0.43.* +bitsandbytes==0.44.* colorama datasets einops From 301375834e55fcda6fda96332cad187aab2c6f77 Mon Sep 17 00:00:00 2001 From: Philipp Emanuel Weidmann Date: Sat, 28 Sep 2024 07:20:12 +0530 Subject: [PATCH 11/41] =?UTF-8?q?Exclude=20Top=20Choices=20(XTC):=20A=20sa?= =?UTF-8?q?mpler=20that=20boosts=20creativity,=20breaks=20writing=20clich?= =?UTF-8?q?=C3=A9s,=20and=20inhibits=20non-verbatim=20repetition=20(#6335)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extensions/openai/typing.py | 2 ++ modules/loaders.py | 6 ++++ modules/presets.py | 4 ++- modules/sampler_hijack.py | 61 ++++++++++++++++++++++++++++++++++++- modules/text_generation.py | 2 +- modules/ui.py | 2 ++ modules/ui_parameters.py | 4 +++ 7 files changed, 78 insertions(+), 3 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 4015f6a1..f63c1f39 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -37,6 +37,8 @@ class GenerationOptions(BaseModel): dry_base: float = 1.75 dry_allowed_length: int = 2 dry_sequence_breakers: str = '"\\n", ":", "\\"", "*"' + xtc_threshold: float = 0.1 + xtc_probability: float = 0 truncation_length: int = 0 max_tokens_second: int = 0 prompt_lookup_num_tokens: int = 0 diff --git a/modules/loaders.py b/modules/loaders.py index 549de5fb..16a3106e 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -168,6 +168,8 @@ def transformers_samplers(): 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers', + 'xtc_threshold', + 'xtc_probability', 'seed', 'do_sample', 'penalty_alpha', @@ -242,6 +244,8 @@ loaders_samplers = { 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers', + 'xtc_threshold', + 'xtc_probability', 'seed', 'do_sample', 'mirostat_mode', @@ -304,6 +308,8 @@ loaders_samplers = { 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers', + 'xtc_threshold', + 'xtc_probability', 'seed', 'do_sample', 'mirostat_mode', diff --git a/modules/presets.py b/modules/presets.py index 06e2be70..c8118fb3 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -44,7 +44,9 @@ def default_preset(): 'dry_base': 1.75, 
'dry_allowed_length': 2, 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', - 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nencoder_repetition_penalty\nno_repeat_ngram' + 'xtc_threshold': 0.1, + 'xtc_probability': 0, + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram' } diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 65503c12..6d92978e 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -1,6 +1,7 @@ import json import math import pprint +import random import torch import transformers @@ -191,6 +192,53 @@ class TopALogitsWarper(LogitsWarper): return scores +# Exclude Top Choices (XTC) +class XTCLogitsWarper(LogitsWarper): + def __init__(self, threshold: float, probability: float, filter_value: float = -float("Inf")): + self.threshold = threshold + self.probability = probability + self.filter_value = filter_value + self.special_token_ids = [ + shared.tokenizer.encode("\n")[-1], + ] + + if shared.tokenizer.eos_token_id is not None: + self.special_token_ids.append(shared.tokenizer.eos_token_id) + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + # `random` returns values in the half-open range [0, 1), so setting `probability` + # to 0 means the sampler never takes action, while setting it to 1 means the sampler + # always takes action. + # + # Note that while XTC is most intuitively described as "if multiple tokens meet + # the threshold, then with probability...", reversing the two conditions is logically + # equivalent, and improves performance because processing can immediately be stopped + # if the random check fails. + if random.random() >= self.probability: + return scores + + sorted_logits, sorted_indices = torch.sort(scores, descending=True) + probs = sorted_logits.softmax(dim=-1) + + sorted_indices_to_remove = torch.full_like(probs, False, dtype=torch.bool) + + # This operation sets exactly those indices to `True` for which the next index has + # probability above the threshold. Since `probs` is sorted, those are the indices + # of all tokens that meet the threshold, *except* the least probable one. 
+ sorted_indices_to_remove[..., :-1] = probs[..., 1:] >= self.threshold + + # Convert sorted_indices_to_remove to the original indices + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + + # If newline or EOS tokens would be removed, return the original scores + if indices_to_remove[:, self.special_token_ids].any(): + return scores + + # Otherwise, remove tokens with the mask + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + class DRYLogitsProcessor(LogitsProcessor): def __init__(self, multiplier: float, base: float, allowed_length: int, sequence_breakers: set[int], _range: int): self.multiplier = multiplier @@ -474,6 +522,14 @@ def get_logits_processor_patch(self, **kwargs): ) ) + if generation_config.xtc_probability is not None and generation_config.xtc_probability > 0: + warpers_to_add.append( + XTCLogitsWarper( + threshold=generation_config.xtc_threshold, + probability=generation_config.xtc_probability, + ) + ) + if generation_config.dynamic_temperature: warpers_to_add.append( DynamicTemperatureLogitsWarper( @@ -534,6 +590,7 @@ def get_logits_processor_patch(self, **kwargs): 'TopKLogitsWarper': 'top_k', 'TopPLogitsWarper': 'top_p', 'TypicalLogitsWarper': 'typical_p', + 'XTCLogitsWarper': 'xtc', 'RepetitionPenaltyLogitsProcessorWithRange': 'repetition_penalty', 'PresencePenaltyLogitsProcessor': 'presence_penalty', 'FrequencyPenaltyLogitsProcessor': 'frequency_penalty', @@ -588,8 +645,10 @@ def generation_config_init_patch(self, **kwargs): self.dry_base = kwargs.pop("dry_base", 1.75) self.dry_allowed_length = kwargs.pop("dry_allowed_length", 2) self.dry_sequence_breakers = kwargs.pop("dry_sequence_breakers", '"\\n", ":", "\\"", "*"') + self.xtc_threshold = kwargs.pop("xtc_threshold", 0.1) + self.xtc_probability = kwargs.pop("xtc_probability", 0) self.temperature_last = kwargs.pop("temperature_last", False) - self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'encoder_repetition_penalty', 'no_repeat_ngram']) + self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'xtc', 'encoder_repetition_penalty', 'no_repeat_ngram']) def hijack_samplers(): diff --git a/modules/text_generation.py b/modules/text_generation.py index e7a2b43f..654eff89 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -289,7 +289,7 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0): def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): generate_params = {} - for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 
'dry_sequence_breakers']: + for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers', 'xtc_threshold', 'xtc_probability']: if k in state: generate_params[k] = state[k] diff --git a/modules/ui.py b/modules/ui.py index 47f92cf0..76b1c009 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -158,6 +158,8 @@ def list_interface_input_elements(): 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers', + 'xtc_threshold', + 'xtc_probability', 'do_sample', 'penalty_alpha', 'mirostat_mode', diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index eff62c20..a2665e0d 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -45,6 +45,10 @@ def create_ui(default_preset): shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.') shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.') + with gr.Blocks(): + shared.gradio['xtc_threshold'] = gr.Slider(0, 0.5, value=generate_params['xtc_threshold'], step=0.01, label='xtc_threshold', info='If 2 or more tokens have probability above this threshold, consider removing all but the last one.') + shared.gradio['xtc_probability'] = gr.Slider(0, 1, value=generate_params['xtc_probability'], step=0.01, label='xtc_probability', info='Probability that the removal will actually happen. 0 disables the sampler. 
1 makes it always happen.') + gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab)") with gr.Column(): From 46996f6519d483841a49c8a857104a6bf6e2ac7a Mon Sep 17 00:00:00 2001 From: RandoInternetPreson Date: Fri, 27 Sep 2024 23:26:03 -0400 Subject: [PATCH 12/41] ExllamaV2 tensor parallelism to increase multi gpu inference speeds (#6356) --- modules/exllamav2.py | 26 ++++++++++++++++++-------- modules/exllamav2_hf.py | 28 +++++++++++++++++++--------- modules/shared.py | 1 + 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index a770e342..42b9ade1 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -7,6 +7,7 @@ from exllamav2 import ( ExLlamaV2Cache, ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4, + ExLlamaV2Cache_TP, ExLlamaV2Config, ExLlamaV2Tokenizer ) @@ -54,21 +55,30 @@ class Exllamav2Model: model = ExLlamaV2(config) - if not shared.args.autosplit: - split = None - if shared.args.gpu_split: - split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] + split = None + if shared.args.gpu_split: + split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] + if shared.args.enable_tp: + model.load_tp(split) + elif not shared.args.autosplit: model.load(split) + # Determine the correct cache type if shared.args.cache_8bit: - cache = ExLlamaV2Cache_8bit(model, lazy=shared.args.autosplit) + cache_type = ExLlamaV2Cache_8bit elif shared.args.cache_4bit: - cache = ExLlamaV2Cache_Q4(model, lazy=shared.args.autosplit) + cache_type = ExLlamaV2Cache_Q4 else: - cache = ExLlamaV2Cache(model, lazy=shared.args.autosplit) + cache_type = ExLlamaV2Cache - if shared.args.autosplit: + # Use TP if specified + if shared.args.enable_tp: + cache = ExLlamaV2Cache_TP(model, base=cache_type) + else: + cache = cache_type(model, lazy=shared.args.autosplit) + + if shared.args.autosplit and not shared.args.enable_tp: model.load_autosplit(cache) tokenizer = ExLlamaV2Tokenizer(config) diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 53143d9a..febb2c64 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -9,6 +9,7 @@ from exllamav2 import ( ExLlamaV2Cache, ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4, + ExLlamaV2Cache_TP, ExLlamaV2Config ) from torch.nn import CrossEntropyLoss @@ -42,21 +43,30 @@ class Exllamav2HF(PreTrainedModel): self.ex_model = ExLlamaV2(config) - if not shared.args.autosplit: - split = None - if shared.args.gpu_split: - split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] + split = None + if shared.args.gpu_split: + split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] - self.ex_model.load(split) + if shared.args.enable_tp: + model.load_tp(split) + elif not shared.args.autosplit: + model.load(split) + # Determine the correct cache type if shared.args.cache_8bit: - self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=shared.args.autosplit) + cache_type = ExLlamaV2Cache_8bit elif shared.args.cache_4bit: - self.ex_cache = ExLlamaV2Cache_Q4(self.ex_model, lazy=shared.args.autosplit) + cache_type = ExLlamaV2Cache_Q4 else: - self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=shared.args.autosplit) + cache_type = ExLlamaV2Cache - if shared.args.autosplit: + # Use TP if specified + if shared.args.enable_tp: + self.ex_cache = ExLlamaV2Cache_TP(self.ex_model, base=cache_type) + else: + self.ex_cache = cache_type(self.ex_model, lazy=shared.args.autosplit) + + if shared.args.autosplit and not shared.args.enable_tp: 
self.ex_model.load_autosplit(self.ex_cache) self.past_seq = None diff --git a/modules/shared.py b/modules/shared.py index 43533a14..894ed6fe 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -146,6 +146,7 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') +group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') # AutoGPTQ group = parser.add_argument_group('AutoGPTQ') From 7276dca933626eef587ac0adf53ab8ea06c2e9ac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 27 Sep 2024 20:26:36 -0700 Subject: [PATCH 13/41] Fix a typo --- modules/exllamav2_hf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index febb2c64..96a89429 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -48,9 +48,9 @@ class Exllamav2HF(PreTrainedModel): split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] if shared.args.enable_tp: - model.load_tp(split) + self.ex_model.load_tp(split) elif not shared.args.autosplit: - model.load(split) + self.ex_model.load(split) # Determine the correct cache type if shared.args.cache_8bit: From ca5a2dba72817def91b616e9571d8490b9b64a13 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 09:39:37 -0700 Subject: [PATCH 14/41] Bump rocm to 6.1.2 --- README.md | 2 +- one_click.py | 6 +++--- requirements_amd.txt | 8 ++++---- requirements_amd_noavx2.txt | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 22ceeeb8..4449639d 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ conda activate textgen |--------|---------|---------| | Linux/WSL | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | | Linux/WSL | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.6` | +| Linux | AMD | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm6.1` | | MacOS + MPS | Any | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | | Windows | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | | Windows | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | diff --git a/one_click.py b/one_click.py index d35ecc89..9b813873 100644 --- a/one_click.py +++ b/one_click.py @@ -117,7 +117,7 @@ def update_pytorch(): elif is_cuda: install_pytorch += "--index-url https://download.pytorch.org/whl/cu121" elif is_rocm: - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm5.6" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" elif is_cpu: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif is_intel: @@ -239,7 +239,7 @@ def 
install_webui(): "What is your GPU?", { 'A': 'NVIDIA', - 'B': 'AMD (Linux/MacOS only. Requires ROCm SDK 5.6 on Linux)', + 'B': 'AMD (Linux/MacOS only. Requires ROCm SDK 6.1 on Linux)', 'C': 'Apple M Series', 'D': 'Intel Arc (IPEX)', 'N': 'None (I want to run models in CPU mode)' @@ -294,7 +294,7 @@ def install_webui(): else: install_pytorch += "--index-url https://download.pytorch.org/whl/cu121" elif selected_gpu == "AMD": - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm5.6" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" elif selected_gpu in ["APPLE", "NONE"]: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif selected_gpu == "INTEL": diff --git a/requirements_amd.txt b/requirements_amd.txt index b224ce14..9f46ed4f 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -40,10 +40,10 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.90+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.90+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version 
== "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 3cdb8a7a..9c5fb5f2 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -40,8 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" From 85994e3ef0f75b408ef5b9a6995902eb600be970 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 09:44:08 -0700 Subject: [PATCH 15/41] Bump pytorch to 2.4.1 --- README.md | 14 +++++++------- one_click.py | 6 +++--- requirements.txt | 16 ++++++++-------- requirements_amd.txt | 4 ++-- requirements_amd_noavx2.txt | 4 ++-- requirements_noavx2.txt | 16 ++++++++-------- 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 4449639d..350dd090 100644 --- a/README.md +++ b/README.md @@ -80,12 +80,12 @@ conda activate textgen | System | GPU | Command | |--------|---------|---------| -| Linux/WSL | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | -| Linux/WSL | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm6.1` | -| MacOS + MPS | Any | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | -| Windows | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | -| Windows | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | +| Linux/WSL | NVIDIA | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121` | +| Linux/WSL | CPU only | `pip3 install torch==2.4.1 torchvision==0.19.1 
torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu` | +| Linux | AMD | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/rocm6.1` | +| MacOS + MPS | Any | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1` | +| Windows | NVIDIA | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121` | +| Windows | CPU only | `pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1` | The up-to-date commands can be found here: https://pytorch.org/get-started/locally/. @@ -150,7 +150,7 @@ Then browse to 1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12: ``` -pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118 +pip3 install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu118 conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime ``` diff --git a/one_click.py b/one_click.py index 9b813873..0f9c703d 100644 --- a/one_click.py +++ b/one_click.py @@ -16,9 +16,9 @@ import sys # Define the required PyTorch version -TORCH_VERSION = "2.2.2" -TORCHVISION_VERSION = "0.17.2" -TORCHAUDIO_VERSION = "2.2.2" +TORCH_VERSION = "2.4.1" +TORCHVISION_VERSION = "0.19.1" +TORCHAUDIO_VERSION = "2.4.1" # Environment script_dir = os.getcwd() diff --git a/requirements.txt b/requirements.txt index 54bd414a..25b4b49d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,15 +55,15 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 9f46ed4f..cd17a600 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -42,8 +42,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp # AMD wheels https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == 
"x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 9c5fb5f2..dd9bd893 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -40,8 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 39bfb0bd..ec0bbb1d 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -55,15 +55,15 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels 
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 1a870b3ea7f0b3337a0c0202fe4b4dddb4e559a0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 19:38:56 -0700 Subject: [PATCH 16/41] Remove AutoAWQ and AutoGPTQ from requirements (no wheels available) --- requirements.txt | 9 --------- requirements_amd.txt | 4 ---- requirements_amd_noavx2.txt | 4 ---- requirements_noavx2.txt | 9 --------- 4 files changed, 26 deletions(-) diff --git a/requirements.txt b/requirements.txt index 25b4b49d..f42247a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ accelerate==0.33.* aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" -auto-gptq==0.7.1 bitsandbytes==0.44.* colorama datasets @@ -64,11 +63,3 @@ https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd.txt b/requirements_amd.txt index cd17a600..88fdcad7 
100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -45,7 +45,3 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/ro https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index dd9bd893..0ad440e3 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -43,7 +43,3 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index ec0bbb1d..61981ca5 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,6 +1,5 @@ accelerate==0.33.* aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" -auto-gptq==0.7.1 bitsandbytes==0.44.* 
colorama datasets @@ -64,11 +63,3 @@ https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" From 65e58640846d65200364b4a65a7975ce48eb4618 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:25:26 -0700 Subject: [PATCH 17/41] Update README --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 350dd090..fdd9c46c 100644 --- a/README.md +++ b/README.md @@ -10,17 +10,17 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -* Multiple backends for text generation in a single UI and API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported through the Transformers loader. 
-* OpenAI-compatible API server with Chat and Completions endpoints – see the [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). -* Automatic prompt formatting for each model using the Jinja2 template in its metadata. -* Three chat modes: `instruct`, `chat-instruct`, and `chat`, allowing for both instruction-following and casual conversations with characters. `chat-instruct` mode automatically applies the model's template to the chat prompt, ensuring high-quality outputs without manual setup. -* "Past chats" menu to quickly switch between conversations and start new ones. -* Free-form generation in the Default/Notebook tabs without being limited to chat turns. Send formatted chat conversations from the Chat tab to these tabs. -* Multiple sampling parameters and generation options for sophisticated text generation control. -* Easy switching between different models through the UI without restarting, using the "Model" tab. -* Simple LoRA fine-tuning tool to customize models with your data. -* All in one folder. The requirements are installed in a self-contained `installer_files` folder that doesn't interfere with the system's environment. -* Extensions support, including numerous built-in and user-contributed extensions. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. +- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Also supports [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) through the Transformers loader. +- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). +- Automatic prompt formatting using Jinja2 templates. +- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with auto-prompt templates in `chat-instruct`. +- "Past chats" menu for easy conversation switching. +- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. +- Multiple sampling parameters and generation options for sophisticated text generation control. +- Switch models easily in the UI without restarting. +- Simple LoRA fine-tuning tool. +- Requirements installed in a self-contained `installer_files` directory that doesn't interfere with the system environment. +- Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. 
## How to install From b92d7fd43e6f39410d4fc486e6168160b449aacc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:30:24 -0700 Subject: [PATCH 18/41] Add warnings for when AutoGPTQ, TensorRT-LLM, or HQQ are missing --- modules/models.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/modules/models.py b/modules/models.py index b0e2346e..3e8ae3f2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -70,11 +70,11 @@ def load_model(model_name, loader=None): shared.model_name = model_name load_func_map = { 'Transformers': huggingface_loader, - 'AutoGPTQ': AutoGPTQ_loader, 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, + 'AutoGPTQ': AutoGPTQ_loader, 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -302,12 +302,6 @@ def llamacpp_HF_loader(model_name): return model -def AutoGPTQ_loader(model_name): - import modules.AutoGPTQ_loader - - return modules.AutoGPTQ_loader.load_quantized(model_name) - - def ExLlamav2_loader(model_name): from modules.exllamav2 import Exllamav2Model @@ -321,9 +315,21 @@ def ExLlamav2_HF_loader(model_name): return Exllamav2HF.from_pretrained(model_name) +def AutoGPTQ_loader(model_name): + try: + import modules.AutoGPTQ_loader + except ModuleNotFoundError: + raise ModuleNotFoundError("Failed to import 'autogptq'. Please install it manually following the instructions in the AutoGPTQ GitHub repository.") + + return modules.AutoGPTQ_loader.load_quantized(model_name) + + def HQQ_loader(model_name): - from hqq.core.quantize import HQQBackend, HQQLinear - from hqq.models.hf.base import AutoHQQHFModel + try: + from hqq.core.quantize import HQQBackend, HQQLinear + from hqq.models.hf.base import AutoHQQHFModel + except ModuleNotFoundError: + raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.") logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"") @@ -334,7 +340,10 @@ def HQQ_loader(model_name): def TensorRT_LLM_loader(model_name): - from modules.tensorrt_llm import TensorRTLLMModel + try: + from modules.tensorrt_llm import TensorRTLLMModel + except ModuleNotFoundError: + raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.") model = TensorRTLLMModel.from_pretrained(model_name) return model From c61b29b9ce143e778fc717accd70e0bbabede720 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:33:17 -0700 Subject: [PATCH 19/41] Simplify the warning when flash-attn fails to import --- modules/exllamav2.py | 8 -------- modules/exllamav2_hf.py | 8 -------- 2 files changed, 16 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 42b9ade1..0498c488 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -19,14 +19,6 @@ from modules.text_generation import get_max_prompt_length try: import flash_attn -except ModuleNotFoundError: - logger.warning( - 'You are running ExLlamaV2 without flash-attention. 
This will cause the VRAM usage ' - 'to be a lot higher than it could be.\n' - 'Try installing flash-attention following the instructions here: ' - 'https://github.com/Dao-AILab/flash-attention#installation-and-features' - ) - pass except Exception: logger.warning('Failed to load flash-attention due to the following error:\n') traceback.print_exc() diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 96a89429..320a8d24 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -21,14 +21,6 @@ from modules.logging_colors import logger try: import flash_attn -except ModuleNotFoundError: - logger.warning( - 'You are running ExLlamaV2 without flash-attention. This will cause the VRAM usage ' - 'to be a lot higher than it could be.\n' - 'Try installing flash-attention following the instructions here: ' - 'https://github.com/Dao-AILab/flash-attention#installation-and-features' - ) - pass except Exception: logger.warning('Failed to load flash-attention due to the following error:\n') traceback.print_exc() From 3b99532e0268e7c516e58c1c83931c6843114cc2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:34:59 -0700 Subject: [PATCH 20/41] Remove HQQ and AQLM from requirements --- requirements.txt | 2 -- requirements_amd.txt | 1 - requirements_amd_noavx2.txt | 1 - requirements_apple_intel.txt | 1 - requirements_apple_silicon.txt | 1 - requirements_cpu_only.txt | 1 - requirements_cpu_only_noavx2.txt | 1 - requirements_noavx2.txt | 2 -- requirements_nowheels.txt | 1 - 9 files changed, 11 deletions(-) diff --git a/requirements.txt b/requirements.txt index f42247a6..d3531b9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,10 @@ accelerate==0.33.* -aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" bitsandbytes==0.44.* colorama datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_amd.txt b/requirements_amd.txt index 88fdcad7..1be85205 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -4,7 +4,6 @@ datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 0ad440e3..5026fa6d 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -4,7 +4,6 @@ datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index fd91464d..eee8112d 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -4,7 +4,6 @@ datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 8a50c8b4..fa244a93 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -4,7 +4,6 @@ datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 4e89b6ad..4c970b91 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -4,7 +4,6 @@ datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 2c00ccf5..4b22cb86 100644 --- a/requirements_cpu_only_noavx2.txt +++ 
b/requirements_cpu_only_noavx2.txt @@ -4,7 +4,6 @@ datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 61981ca5..fe6adb46 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,12 +1,10 @@ accelerate==0.33.* -aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" bitsandbytes==0.44.* colorama datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index e7cbc29c..5c840f56 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -4,7 +4,6 @@ datasets einops fastapi==0.112.4 gradio==4.26.* -hqq==0.1.7.post3 jinja2==3.1.4 lm_eval==0.3.0 markdown From 3fb02f43f648c7735f7ed8b340c3d36d344aa7b6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:38:43 -0700 Subject: [PATCH 21/41] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fdd9c46c..59cfb5fc 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Also supports [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) through the Transformers loader. +- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2).[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually. - OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Automatic prompt formatting using Jinja2 templates. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with auto-prompt templates in `chat-instruct`. From 3f0571b62bb020e19c122d0cd30d5c5df4aa2917 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:48:30 -0700 Subject: [PATCH 22/41] Update README --- README.md | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 59cfb5fc..954d7a7d 100644 --- a/README.md +++ b/README.md @@ -10,33 +10,29 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. 
## Features -- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2).[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually. +- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually. - OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Automatic prompt formatting using Jinja2 templates. -- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with auto-prompt templates in `chat-instruct`. -- "Past chats" menu for easy conversation switching. +- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. +- "Past chats" menu to easily switch between conversations. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Multiple sampling parameters and generation options for sophisticated text generation control. -- Switch models easily in the UI without restarting. +- Switch between different models easily in the UI without restarting. - Simple LoRA fine-tuning tool. - Requirements installed in a self-contained `installer_files` directory that doesn't interfere with the system environment. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install -1) Clone or [download](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip) the repository. -2) Run the `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat` script depending on your OS. +1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip). +2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`. 3) Select your GPU vendor when asked. 4) Once the installation ends, browse to `http://localhost:7860`. 5) Have fun! -To restart the web UI in the future, run the `start_` script again. +To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. -This script creates an `installer_files` folder where it sets up the project's requirements. If you need to reinstall the requirements, just delete that folder and start the web UI again. 
- -The script accepts command-line flags, such as `./start_linux.sh --help`. Alternatively, you can edit the `CMD_FLAGS.txt` file with a text editor and add your flags there, such as `--api` in case you need to use the API. - -To get updates in the future, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. +You can use command-line flags, like `./start_linux.sh --help`, or add them to `CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
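A minimal sketch of using the OpenAI-compatible API referenced in the features and install sections above: it assumes the web UI was started with the `--api` flag (for example via `CMD_FLAGS.txt`) and that the Chat Completions endpoint is served locally on port 5000, as in the project's wiki examples; the prompt, `mode`, and `max_tokens` values are illustrative placeholders rather than part of these patches.

```python
import requests

# Assumes the web UI was launched with --api and that the OpenAI-compatible
# API is listening on the default local port 5000.
URL = "http://127.0.0.1:5000/v1/chat/completions"

payload = {
    "messages": [
        {"role": "user", "content": "Summarize what a LoRA adapter is in one sentence."}
    ],
    # "mode" mirrors the chat modes mentioned in the README; "instruct" applies
    # the model's own prompt template.
    "mode": "instruct",
    "max_tokens": 200,
}

response = requests.post(URL, json=payload, timeout=120)
response.raise_for_status()

# The reply follows the OpenAI Chat Completions response schema.
print(response.json()["choices"][0]["message"]["content"])
```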
From 57160cd6fa37c8367df3c674256af970b31e0add Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:50:41 -0700 Subject: [PATCH 23/41] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 954d7a7d..fb22d099 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Automatic prompt formatting using Jinja2 templates. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. -- "Past chats" menu to easily switch between conversations. +- "Past chats" menu to quickly switch between conversations. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Multiple sampling parameters and generation options for sophisticated text generation control. - Switch between different models easily in the UI without restarting. From 055f3f5632f99f99409fd85d4e06c4c70acf8b33 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:55:26 -0700 Subject: [PATCH 24/41] Fix after #6386 (thanks @Touch-Night) --- one_click.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/one_click.py b/one_click.py index 0f9c703d..9bad25d1 100644 --- a/one_click.py +++ b/one_click.py @@ -189,8 +189,11 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, conda_sh_path = os.path.join(script_dir, "installer_files", "conda", "etc", "profile.d", "conda.sh") cmd = f'. 
"{conda_sh_path}" && conda activate "{conda_env_path}" && {cmd}' + # Set executable to None for Windows, /bin/bash for everything else + executable = None if is_windows() else '/bin/bash' + # Run shell commands - result = subprocess.run(cmd, shell=True, executable='/bin/bash', capture_output=capture_output, env=env) + result = subprocess.run(cmd, shell=True, capture_output=capture_output, env=env, executable=executable) # Assert the command ran successfully if assert_success and result.returncode != 0: From 0f90a1b50fde3bc102bef1a034942ebb4b155845 Mon Sep 17 00:00:00 2001 From: Manuel Schmid <9307310+mashb1t@users.noreply.github.com> Date: Sun, 29 Sep 2024 06:08:55 +0200 Subject: [PATCH 25/41] Do not set value for histories in chat when --multi-user is used (#6317) --- modules/chat.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 00c4ffa9..b81cfea6 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1059,7 +1059,12 @@ def handle_start_new_chat_click(state): convert_to_markdown.cache_clear() - return [history, html, gr.update(choices=histories, value=histories[0][1])] + if len(histories) > 0: + past_chats_update = gr.update(choices=histories, value=histories[0][1]) + else: + past_chats_update = gr.update(choices=histories) + + return [history, html, past_chats_update] def handle_delete_chat_confirm_click(state): @@ -1110,10 +1115,15 @@ def handle_upload_chat_history(load_chat_history, state): convert_to_markdown.cache_clear() + if len(histories) > 0: + past_chats_update = gr.update(choices=histories, value=histories[0][1]) + else: + past_chats_update = gr.update(choices=histories) + return [ history, html, - gr.update(choices=histories, value=histories[0][1]) + past_chats_update ] @@ -1132,6 +1142,11 @@ def handle_character_menu_change(state): convert_to_markdown.cache_clear() + if len(histories) > 0: + past_chats_update = gr.update(choices=histories, value=histories[0][1]) + else: + past_chats_update = gr.update(choices=histories) + return [ history, html, @@ -1140,7 +1155,7 @@ def handle_character_menu_change(state): picture, greeting, context, - gr.update(choices=histories, value=histories[0][1]), + past_chats_update, ] @@ -1151,12 +1166,17 @@ def handle_mode_change(state): convert_to_markdown.cache_clear() + if len(histories) > 0: + past_chats_update = gr.update(choices=histories, value=histories[0][1]) + else: + past_chats_update = gr.update(choices=histories) + return [ history, html, gr.update(visible=state['mode'] != 'instruct'), gr.update(visible=state['mode'] == 'chat-instruct'), - gr.update(choices=histories, value=histories[0][1]) + past_chats_update ] From e4b0467f9f4b054723528c93c4cda56a266f8687 Mon Sep 17 00:00:00 2001 From: Hanusz Leszek Date: Sun, 29 Sep 2024 06:14:19 +0200 Subject: [PATCH 26/41] Add beforeunload event to add confirmation dialog when leaving page (#6279) --- js/main.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/js/main.js b/js/main.js index 899bd8f0..71489f14 100644 --- a/js/main.js +++ b/js/main.js @@ -600,4 +600,15 @@ headerBar.addEventListener("click", (e) => { } }); +//------------------------------------------------ +// Add a confirmation dialog when leaving the page +// Useful to avoid data loss +//------------------------------------------------ +window.addEventListener('beforeunload', function (event) { + // Cancel the event + event.preventDefault(); + // Chrome requires returnValue to be set + event.returnValue = ''; +}); + 
moveToChatTab(); From 01362681f2f9e17b6e28ee99b5cf8551a5e62161 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 29 Sep 2024 07:42:44 -0700 Subject: [PATCH 27/41] Bump exllamav2 to 0.2.3 --- requirements.txt | 10 +++++----- requirements_amd.txt | 6 +++--- requirements_amd_noavx2.txt | 6 +++--- requirements_apple_intel.txt | 2 +- requirements_apple_silicon.txt | 2 +- requirements_noavx2.txt | 10 +++++----- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index d3531b9c..c7ba213f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -52,11 +52,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == 
"3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 1be85205..6ce5eb62 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -41,6 +41,6 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp # AMD wheels https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 5026fa6d..81ad6b23 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -39,6 +39,6 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; 
platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index eee8112d..6c3a696a 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -37,4 +37,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index fa244a93..86ef7197 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -39,4 +39,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index fe6adb46..516b0639 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -52,11 +52,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From bbdeed3cf44d413eadb09a8f443b9d43ef0ce261 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 29 Sep 2024 20:45:27 -0700 Subject: [PATCH 28/41] Make sampler priority high if unspecified --- modules/sampler_hijack.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 6d92978e..87f0b25e 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -602,11 +602,10 @@ def get_logits_processor_patch(self, **kwargs): def custom_sort_key(obj): class_name = obj.__class__.__name__ - # Return a large value if class name is not mapped or if the mapped nickname is not in priority + # Return -1 if class_name is not mapped if class_name not in class_name_to_nickname or class_name_to_nickname[class_name] not in sampler_priority: - return float('inf') + return -1 - # Return the index of the nickname in the priority list for sorting return sampler_priority.index(class_name_to_nickname[class_name]) # Sort the list using the custom key function From 9ca0cd7749f734bd5e7083d672dee8d400229257 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 29 Sep 2024 20:47:04 -0700 Subject: [PATCH 29/41] Bump llama-cpp-python to 0.3.1 --- requirements.txt | 24 ++++++++++++------------ 
requirements_amd.txt | 12 ++++++------ requirements_amd_noavx2.txt | 8 ++++---- requirements_apple_intel.txt | 8 ++++---- requirements_apple_silicon.txt | 12 ++++++------ requirements_cpu_only.txt | 8 ++++---- requirements_cpu_only_noavx2.txt | 8 ++++---- requirements_noavx2.txt | 24 ++++++++++++------------ 8 files changed, 52 insertions(+), 52 deletions(-) diff --git a/requirements.txt b/requirements.txt index c7ba213f..5623f1f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,22 +34,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 6ce5eb62..d203c157 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -33,14 +33,14 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version 
== "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.0+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.1+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.1+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 81ad6b23..3a7a950d 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -33,10 +33,10 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and 
python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 6c3a696a..61c5367f 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and 
platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 86ef7197..494c127a 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -33,10 +33,10 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.0-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" 
and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 4c970b91..5838ac47 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 4b22cb86..97d4eb90 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 516b0639..d34a598d 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -34,22 +34,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.0+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.0+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.0+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" From 4d9ce586d31450d2ed692b1e7a72dbcdfd56e670 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:04:21 -0700 Subject: [PATCH 
30/41] Update llama_cpp_python_hijack.py, fix llamacpp_hf --- modules/llama_cpp_python_hijack.py | 19 ++++++++++++------- modules/llamacpp_hf.py | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py index 3d42b2d7..2a9c10da 100644 --- a/modules/llama_cpp_python_hijack.py +++ b/modules/llama_cpp_python_hijack.py @@ -2,12 +2,12 @@ import importlib import platform from typing import Sequence +import numpy as np from tqdm import tqdm from modules import shared from modules.cache_utils import process_llamacpp_cache - imported_module = None @@ -57,8 +57,6 @@ def eval_with_progress(self, tokens: Sequence[int]): with tqdm to show prompt processing progress. """ - assert self._ctx.ctx is not None - assert self._batch.batch is not None self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1) if len(tokens) > self.n_batch: @@ -80,13 +78,20 @@ def eval_with_progress(self, tokens: Sequence[int]): if self.context_params.logits_all: rows = n_tokens cols = self._n_vocab - logits = self._ctx.get_logits()[: rows * cols] - self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits + logits = np.ctypeslib.as_array( + self._ctx.get_logits(), shape=(rows * cols,) + ) + self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits + self.last_updated_index = n_past + n_tokens - 1 else: rows = 1 cols = self._n_vocab - logits = self._ctx.get_logits()[: rows * cols] - self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits + logits = np.ctypeslib.as_array( + self._ctx.get_logits(), shape=(rows * cols,) + ) + last_token_index = min(n_past + n_tokens - 1, self.scores.shape[0] - 1) + self.scores[last_token_index, :] = logits.reshape(-1) + self.last_updated_index = last_token_index # Update n_tokens self.n_tokens += n_tokens diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 327e3a7b..6611a7c1 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -127,7 +127,7 @@ class LlamacppHF(PreTrainedModel): self.model.reset() self.model.eval(seq) - logits = torch.tensor(self.model.scores[self.model.n_tokens - 1, :]).view(1, 1, -1).to(input_ids.device) + logits = torch.tensor(self.model.scores[self.model.last_updated_index, :]).view(1, 1, -1).to(input_ids.device) else: self.model.reset() self.model.eval(seq) @@ -205,5 +205,6 @@ class LlamacppHF(PreTrainedModel): Llama = llama_cpp_lib().Llama model = Llama(**params) + model.last_updated_index = -1 return LlamacppHF(model, model_file) From 6063a6641444da58dc6f38626bbd639cd114eeb8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:50:38 -0300 Subject: [PATCH 31/41] Update accelerate requirement from ==0.33.* to ==0.34.* (#6416) --- requirements.txt | 2 +- requirements_amd.txt | 2 +- requirements_amd_noavx2.txt | 2 +- requirements_apple_intel.txt | 2 +- requirements_apple_silicon.txt | 2 +- requirements_cpu_only.txt | 2 +- requirements_cpu_only_noavx2.txt | 2 +- requirements_noavx2.txt | 2 +- requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5623f1f7..f623372f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* bitsandbytes==0.44.* colorama datasets diff --git a/requirements_amd.txt b/requirements_amd.txt index d203c157..67e63632 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* 
+accelerate==0.34.* colorama datasets einops diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 3a7a950d..9d1828d6 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* colorama datasets einops diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 61c5367f..7e279163 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* colorama datasets einops diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 494c127a..1f476657 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* colorama datasets einops diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 5838ac47..fe671920 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* colorama datasets einops diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 97d4eb90..32bfe9c5 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* colorama datasets einops diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index d34a598d..85f94eef 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* bitsandbytes==0.44.* colorama datasets diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 5c840f56..5e566798 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -1,4 +1,4 @@ -accelerate==0.33.* +accelerate==0.34.* colorama datasets einops From 617cd7b70514e97a96c365bb5eda481594d48028 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 1 Oct 2024 09:06:25 -0700 Subject: [PATCH 32/41] Revert "Update accelerate requirement from ==0.33.* to ==0.34.* (#6416)" This reverts commit 6063a6641444da58dc6f38626bbd639cd114eeb8. 
--- requirements.txt | 2 +- requirements_amd.txt | 2 +- requirements_amd_noavx2.txt | 2 +- requirements_apple_intel.txt | 2 +- requirements_apple_silicon.txt | 2 +- requirements_cpu_only.txt | 2 +- requirements_cpu_only_noavx2.txt | 2 +- requirements_noavx2.txt | 2 +- requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index f623372f..5623f1f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* bitsandbytes==0.44.* colorama datasets diff --git a/requirements_amd.txt b/requirements_amd.txt index 67e63632..d203c157 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* colorama datasets einops diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 9d1828d6..3a7a950d 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* colorama datasets einops diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 7e279163..61c5367f 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* colorama datasets einops diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 1f476657..494c127a 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* colorama datasets einops diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index fe671920..5838ac47 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* colorama datasets einops diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 32bfe9c5..97d4eb90 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* colorama datasets einops diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 85f94eef..d34a598d 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* bitsandbytes==0.44.* colorama datasets diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 5e566798..5c840f56 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -1,4 +1,4 @@ -accelerate==0.34.* +accelerate==0.33.* colorama datasets einops From c6b50f88dab6435a0c27e5836b33f5b790c072f0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:19:28 -0700 Subject: [PATCH 33/41] Lint --- requirements.txt | 2 +- requirements_amd.txt | 2 +- requirements_amd_noavx2.txt | 2 +- requirements_noavx2.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6d939275..5623f1f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -60,4 +60,4 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3- https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and 
python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd.txt b/requirements_amd.txt index 7741849b..d203c157 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -43,4 +43,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/ro https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.1+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" \ No newline at end of file +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index e4765488..3a7a950d 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -41,4 +41,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp # AMD wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" \ No newline at end of file +https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index edeae78d..d34a598d 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -60,4 +60,4 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3- https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" From cca9d6e22d6cb3e2909121f6d6077af36b2116f5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:21:06 -0700 Subject: [PATCH 34/41] Lint --- modules/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 654eff89..86245098 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -274,10 +274,10 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0): if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from) and not reply.startswith(' '): first_token = shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from])) if isinstance(first_token, (bytes,)): - #try to decode the bytes to a string + # try to decode the bytes to a string + # if it fails, which means it's not a string in this turn, just ignore it try: first_token = first_token.decode('utf8') - #if it fails, which means it's not a string in this turn, just ignore it except UnicodeDecodeError: first_token = '' From d364aa0a3c0993b09d53966b53eae862f4deaae0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:22:57 -0700 Subject: [PATCH 35/41] Lint --- js/main.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/main.js b/js/main.js index 71489f14..3028afac 100644 --- a/js/main.js +++ b/js/main.js @@ -604,11 +604,11 @@ headerBar.addEventListener("click", (e) => { // Add a confirmation dialog when leaving the page // Useful to avoid data loss //------------------------------------------------ -window.addEventListener('beforeunload', function (event) { +window.addEventListener("beforeunload", function (event) { // Cancel the event event.preventDefault(); // Chrome requires returnValue to be set - event.returnValue = ''; + event.returnValue = ""; }); moveToChatTab(); From 93c250b9b6b7d625c9cd698f99400d4fe650779d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:16:15 -0700 Subject: [PATCH 36/41] Add a UI element for enable_tp --- modules/loaders.py | 2 ++ modules/ui.py | 1 + modules/ui_model_menu.py | 1 + 3 files changed, 4 insertions(+) diff --git a/modules/loaders.py b/modules/loaders.py index 16a3106e..deee00a7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -90,6 +90,7 @@ loaders_and_params = OrderedDict({ 'cache_8bit', 'cache_4bit', 'autosplit', + 'enable_tp', 'alpha_value', 'compress_pos_emb', 'trust_remote_code', @@ -105,6 +106,7 @@ 
loaders_and_params = OrderedDict({ 'cache_8bit', 'cache_4bit', 'autosplit', + 'enable_tp', 'alpha_value', 'compress_pos_emb', 'exllamav2_info', diff --git a/modules/ui.py b/modules/ui.py index 76b1c009..c07beeb4 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -90,6 +90,7 @@ def list_model_elements(): 'cache_8bit', 'cache_4bit', 'autosplit', + 'enable_tp', 'threads', 'threads_batch', 'n_batch', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 1883fdca..f87b680a 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -136,6 +136,7 @@ def create_ui(): shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') + shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) From 49dfa0adaf52bcff76b3ab9c33326812b281c2cf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:20:48 -0700 Subject: [PATCH 37/41] Fix the "save preset" event --- modules/ui_file_saving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index ac72c623..3a27e1b9 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -74,7 +74,7 @@ def handle_save_preset_confirm_click(filename, contents): try: utils.save_file(f"presets/{filename}.yaml", contents) available_presets = utils.get_available_presets() - output = gr.update(choices=available_presets, value=filename), + output = gr.update(choices=available_presets, value=filename) except Exception: output = gr.update() traceback.print_exc() From e1338a1804d51d4e74146f2490586985ba499ce0 Mon Sep 17 00:00:00 2001 From: SeanScripts <64337075+SeanScripts@users.noreply.github.com> Date: Tue, 1 Oct 2024 13:49:35 -0700 Subject: [PATCH 38/41] Add whisper turbo (#6423) --- extensions/whisper_stt/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index e45c8b1e..d949e93f 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -96,7 +96,7 @@ def ui(): with gr.Accordion("Settings", open=False): auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit']) device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"]) - whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"]) + whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "turbo"]) whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", 
"catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"]) audio.change( From 22baa5378f27de6427640a4ce3ba3a54bb3a9722 Mon Sep 17 00:00:00 2001 From: Luana Date: Thu, 3 Oct 2024 00:35:13 -0300 Subject: [PATCH 39/41] Fix for systems that have bash in a non-standard directory (#6428) --- one_click.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/one_click.py b/one_click.py index 9bad25d1..f1c91f47 100644 --- a/one_click.py +++ b/one_click.py @@ -189,8 +189,8 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, conda_sh_path = os.path.join(script_dir, "installer_files", "conda", "etc", "profile.d", "conda.sh") cmd = f'. "{conda_sh_path}" && conda activate "{conda_env_path}" && {cmd}' - # Set executable to None for Windows, /bin/bash for everything else - executable = None if is_windows() else '/bin/bash' + # Set executable to None for Windows, bash for everything else + executable = None if is_windows() else 'bash' # Run shell commands result = subprocess.run(cmd, shell=True, capture_output=capture_output, env=env, executable=executable) From 9d8b1c5fd94d360da4ab55334442cfa64aaf40d0 Mon Sep 17 00:00:00 2001 From: Grzegorz Lippe Date: Sat, 5 Oct 2024 16:58:17 +0200 Subject: [PATCH 40/41] Fix intel bug described in #6253 (#6433) --- one_click.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/one_click.py b/one_click.py index f1c91f47..8fc1edf0 100644 --- a/one_click.py +++ b/one_click.py @@ -313,7 +313,7 @@ def install_webui(): if selected_gpu == "INTEL": # Install oneAPI dependencies via conda print_big_message("Installing Intel oneAPI runtime libraries.") - run_cmd("conda install -y -c intel dpcpp-cpp-rt=2024.0 mkl-dpcpp=2024.0") + run_cmd("conda install -y -c https://software.repos.intel.com/python/conda/ -c conda-forge dpcpp-cpp-rt=2024.0 mkl-dpcpp=2024.0") # Install libuv required by Intel-patched torch run_cmd("conda install -y libuv") @@ -329,7 +329,7 @@ def install_extensions_requirements(): print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.") extensions = get_extensions_names() for i, extension in enumerate(extensions): - print(f"\n\n--- [{i+1}/{len(extensions)}]: {extension}\n\n") + print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n") extension_req_path = os.path.join("extensions", extension, "requirements.txt") run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True) From 03a2e7005412efb7765ce3f0bfccd6b94e79a843 Mon Sep 17 
00:00:00 2001 From: PIRI <34787507+ThisIsPIRI@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:25:14 +0000 Subject: [PATCH 41/41] Fix temperature_last when temperature not in sampler priority (#6439) --- modules/sampler_hijack.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 87f0b25e..97951dd2 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -571,11 +571,10 @@ def get_logits_processor_patch(self, **kwargs): if generation_config.temperature_last: for param_name in ['temperature', 'dynamic_temperature', 'quadratic_sampling']: if param_name in sampler_priority: - if param_name in sampler_priority: - index = sampler_priority.index(param_name) - sampler_priority.append(sampler_priority.pop(index)) - else: - sampler_priority.append(param_name) + index = sampler_priority.index(param_name) + sampler_priority.append(sampler_priority.pop(index)) + else: + sampler_priority.append(param_name) class_name_to_nickname = { 'DynamicTemperatureLogitsWarper': 'dynamic_temperature',