From b15f51015477c9709e2dff616c20466e9b3dc727 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 20 Dec 2023 07:31:42 -0800
Subject: [PATCH 1/4] Optimize ExLlamav2 (non-HF) loader

---
 modules/exllamav2.py | 36 ++++++++----------------------------
 1 file changed, 8 insertions(+), 28 deletions(-)

diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 2cf4a039..3a6b231a 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -1,4 +1,3 @@
-import random
 import traceback
 from pathlib import Path
 
@@ -10,7 +9,7 @@ from exllamav2 import (
     ExLlamaV2Config,
     ExLlamaV2Tokenizer
 )
-from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
 
 from modules import shared
 from modules.logging_colors import logger
@@ -64,7 +63,7 @@ class Exllamav2Model:
         else:
             cache = ExLlamaV2Cache(model)
 
-        generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
+        generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
 
         result = self()
         result.model = model
@@ -115,41 +114,22 @@ class Exllamav2Model:
 
         ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
         ids = ids[:, -get_max_prompt_length(state):]
-        initial_len = ids.shape[-1]
 
         if state['auto_max_new_tokens']:
             max_new_tokens = state['truncation_length'] - ids.shape[-1]
         else:
             max_new_tokens = state['max_new_tokens']
 
-        # _gen_begin_base
-        self.cache.current_seq_len = 0
-        self.model.forward(ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras)
+        self.generator.set_stop_conditions([])
+        self.generator.begin_stream(ids, settings, loras=self.loras)
 
-        has_leading_space = False
+        decoded_text = ''
         for i in range(max_new_tokens):
-            logits = self.model.forward(ids[:, -1:], self.cache, input_mask=None, loras=self.loras).float().cpu()
-            token, _, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random(), self.tokenizer)
-            ids = torch.cat([ids, token], dim=1)
-
-            if i == 0 and self.tokenizer.tokenizer.id_to_piece(int(token)).startswith('▁'):
-                has_leading_space = True
-
-            decoded_text = self.tokenizer.decode(ids[:, initial_len:], decode_special_tokens=not state['skip_special_tokens'])[0]
-            if has_leading_space:
-                decoded_text = ' ' + decoded_text
-
-            # Check the partial unicode character
-            if chr(0xfffd) in decoded_text:
-                is_last = i == max_new_tokens - 1
-                is_stopping = token.item() == self.tokenizer.eos_token_id or shared.stop_everything
-                # If we are not at the end of the generation, we skip this token
-                if not (is_last or is_stopping):
-                    continue
-
-            if token.item() == self.tokenizer.eos_token_id or shared.stop_everything:
+            chunk, eos, _ = self.generator.stream()
+            if eos or shared.stop_everything:
                 break
 
+            decoded_text += chunk
             yield decoded_text
 
     def generate(self, prompt, state):

From f0f6d9bdf9f6fbd41965d619c9359de65c0a7d10 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 20 Dec 2023 07:36:33 -0800
Subject: [PATCH 2/4] Add HQQ back & update version

This reverts commit 2289e9031e50326ddfae962db6e7f3cc6225077f.
---
 modules/models.py                | 8 ++------
 requirements.txt                 | 1 +
 requirements_amd.txt             | 1 +
 requirements_amd_noavx2.txt      | 1 +
 requirements_apple_intel.txt     | 1 +
 requirements_apple_silicon.txt   | 1 +
 requirements_cpu_only.txt        | 1 +
 requirements_cpu_only_noavx2.txt | 1 +
 requirements_noavx2.txt          | 1 +
 requirements_nowheels.txt        | 1 +
 10 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index cad6a165..7a1124d1 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -413,12 +413,8 @@ def ExLlamav2_HF_loader(model_name):
 
 
 def HQQ_loader(model_name):
-    try:
-        from hqq.core.quantize import HQQBackend, HQQLinear
-        from hqq.engine.hf import HQQModelForCausalLM
-    except ModuleNotFoundError:
-        logger.error("HQQ is not installed. You can install it with:\n\npip install hqq")
-        return None
+    from hqq.core.quantize import HQQBackend, HQQLinear
+    from hqq.engine.hf import HQQModelForCausalLM
 
     logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
 
diff --git a/requirements.txt b/requirements.txt
index 38f8efdb..4843741b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 458d810d..f15014ad 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index a4bb1551..843cbac1 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 56eccd35..cee6d185 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 54606b01..a3aede26 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index 09936b74..af04acf7 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 65734de4..1c9d15c0 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 6c8579f3..39751fc5 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index d9a689f9..22e10c6b 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*

From bcba200790bcf44164c83db7a8eb2c81e06285c1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 20 Dec 2023 07:54:06 -0800
Subject: [PATCH 3/4] Fix EOS being ignored in ExLlamav2 after previous commit

---
 modules/exllamav2.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 3a6b231a..2730d9f5 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -120,7 +120,6 @@ class Exllamav2Model:
         else:
             max_new_tokens = state['max_new_tokens']
 
-        self.generator.set_stop_conditions([])
         self.generator.begin_stream(ids, settings, loras=self.loras)
 
         decoded_text = ''

From 6efbe3009fb7accb10de6f4777c8aa7edc5cb65e Mon Sep 17 00:00:00 2001
From: luna
Date: Wed, 20 Dec 2023 13:29:19 -0300
Subject: [PATCH 4/4] let exllama v1 models load safetensor loras (#4854)

---
 modules/LoRA.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index 97027eb4..be2a7c75 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -53,7 +53,10 @@ def add_lora_exllama(lora_names):
 
         lora_path = get_lora_path(lora_names[0])
         lora_config_path = lora_path / "adapter_config.json"
-        lora_adapter_path = lora_path / "adapter_model.bin"
+        for file_name in ["adapter_model.safetensors", "adapter_model.bin"]:
+            file_path = lora_path / file_name
+            if file_path.is_file():
+                lora_adapter_path = file_path
 
         logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
         if shared.model.__class__.__name__ == 'ExllamaModel':
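
For reference: PATCH 1/4 switches the non-HF ExLlamav2 loader from a manual per-token sampling loop to ExLlamaV2StreamingGenerator, and PATCH 3/4 then drops the set_stop_conditions([]) call because clearing the stop conditions caused EOS to be ignored. Below is a minimal standalone sketch of that streaming flow against exllamav2 0.0.11; the model directory, prompt, sampler values, and token cap are placeholder assumptions and are not taken from the patches.

# Minimal sketch of the ExLlamaV2StreamingGenerator flow used by the patched loader.
# Model path, prompt, temperature, and the 200-token cap below are placeholders.
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator

config = ExLlamaV2Config()
config.model_dir = "/path/to/exl2-model"  # placeholder path
config.prepare()

model = ExLlamaV2(config)
model.load()
tokenizer = ExLlamaV2Tokenizer(config)
cache = ExLlamaV2Cache(model)
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

settings = ExLlamaV2Sampler.Settings()
settings.temperature = 0.7  # example value

ids = tokenizer.encode("Once upon a time")
# Stop on the EOS token explicitly here; the patched loader instead leaves the
# generator's stop conditions untouched after PATCH 3/4.
generator.set_stop_conditions([tokenizer.eos_token_id])
generator.begin_stream(ids, settings)

decoded_text = ''
for _ in range(200):  # cap on new tokens
    chunk, eos, _ = generator.stream()
    if eos:
        break
    decoded_text += chunk

print(decoded_text)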