From 03dc69edc5436b9426238fa626212dcffd9d62a3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:12:19 -0700
Subject: [PATCH] ExLlama_HF (v1 and v2) prefix matching

---
 modules/exllama_hf.py   | 26 +++++++++++++++++++++-----
 modules/exllamav2_hf.py | 27 +++++++++++++++++++++------
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py
index 9e4701bf..3245ac87 100644
--- a/modules/exllama_hf.py
+++ b/modules/exllama_hf.py
@@ -77,17 +77,33 @@ class ExllamaHF(PreTrainedModel):
             seq = past_key_values + seq
 
         seq_tensor = torch.tensor(seq)
+        reset = True
 
         # Make the forward call
         if labels is None:
-            if past_seq is None or not torch.equal(past_seq, seq_tensor[:-1]):
-                ex_cache.current_seq_len = 0
-                self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), ex_cache, preprocess_only=True, lora=self.lora)
+            if past_seq is not None:
+                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
+                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
+                if len(indices) > 0:
+                    longest_prefix = indices[0].item()
+                else:
+                    longest_prefix = min_length
 
-            logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), ex_cache, lora=self.lora).to(input_ids.device)
+                if longest_prefix > 0:
+                    reset = False
+                    ex_cache.current_seq_len = longest_prefix
+                    if len(seq_tensor) - longest_prefix > 1:
+                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora)
+
+            if reset:
+                ex_cache.current_seq_len = 0
+                if len(seq_tensor) > 1:
+                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora)
+
+            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, lora=self.lora).to(input_ids.device)
         else:
             ex_cache.current_seq_len = 0
-            logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache, last_id_only=False, lora=self.lora)
+            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, lora=self.lora)
 
         if is_negative:
             self.past_seq_negative = seq_tensor

diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index 457942ac..6542ede9 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -81,18 +81,33 @@ class Exllamav2HF(PreTrainedModel):
             seq = past_key_values + seq
 
         seq_tensor = torch.tensor(seq)
+        reset = True
 
         # Make the forward call
         if labels is None:
-            if past_seq is None or not torch.equal(past_seq, seq_tensor[:-1]):
-                ex_cache.current_seq_len = 0
-                self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), ex_cache, preprocess_only=True)
+            if past_seq is not None:
+                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
+                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
+                if len(indices) > 0:
+                    longest_prefix = indices[0].item()
+                else:
+                    longest_prefix = min_length
 
-            logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), ex_cache).to(input_ids.device)
+                if longest_prefix > 0:
+                    reset = False
+                    ex_cache.current_seq_len = longest_prefix
+                    if len(seq_tensor) - longest_prefix > 1:
+                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True)
+
+            if reset:
+                ex_cache.current_seq_len = 0
+                if len(seq_tensor) > 1:
+                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True)
+
+            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache).to(input_ids.device)
         else:
             ex_cache.current_seq_len = 0
-            # logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache, last_id_only=False)
-            logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache)
+            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False)
 
         if is_negative:
             self.past_seq_negative = seq_tensor
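
---

Note on the change: instead of resetting the ExLlama cache whenever the new
sequence is not exactly the cached past_seq plus one appended token, both
wrappers now locate the longest common prefix between the cached sequence and
the incoming one, rewind ex_cache.current_seq_len to the end of that prefix,
and re-ingest only the unmatched tail (everything except the last token) with
preprocess_only=True. A full reset is kept as the fallback for the first call
and for a common prefix of length zero.

A minimal standalone sketch of the prefix computation, assuming plain 1-D
token tensors (the helper name longest_common_prefix is hypothetical; the
patch inlines this logic directly in each wrapper):

    import torch

    def longest_common_prefix(past_seq: torch.Tensor, seq: torch.Tensor) -> int:
        """Number of leading tokens shared by the cached and the new sequence."""
        min_length = min(past_seq.shape[0], seq.shape[0])
        # Positions inside the overlapping region where the sequences disagree;
        # the first mismatch bounds the reusable prefix.
        indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq[:min_length]))
        return indices[0].item() if len(indices) > 0 else min_length

    # Toy check: the first three tokens match, so the cache can be rewound to
    # position 3 and only the tokens after it need another forward pass.
    past = torch.tensor([1, 2, 3, 4, 5])
    new = torch.tensor([1, 2, 3, 9, 9, 9])
    assert longest_common_prefix(past, new) == 3

This turns cache reuse from an exact continuation check into prefix matching,
so, for example, regenerating a reply or editing the last chat message no
longer forces a full re-ingestion of the prompt.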