Add ExLlamaV2 and ExLlamav2_HF loaders (#3881)

2024-11-25 17:29:22 +01:00 · 2023-09-12 14:33:07 -03:00 · 2023-09-12 14:33:07 -03:00 · c2a309f56e
commit c2a309f56e
parent a821928877
9 changed files with 295 additions and 5 deletions
--- a/models/config.yaml
+++ b/models/config.yaml
@ -210,7 +210,7 @@ llama-65b-gptq-3bit:
  instruction_template: 'Alpaca'
 .*llama-(2|v2):
  truncation_length: 4096
-.*llama-(2|v2).*chat:
+.*llama(-?)(2|v2).*chat:
  instruction_template: 'Llama-v2'
 .*newhope:
  instruction_template: 'NewHope'
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@ -0,0 +1,102 @@
 import random
 from pathlib import Path
 import torch
 from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache,
    ExLlamaV2Config,
    ExLlamaV2Tokenizer
 )
 from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
 from modules import shared
 from modules.text_generation import get_max_prompt_length
 class Exllamav2Model:
    """Text-generation wrapper around an ExLlamaV2 model.

    Instances are built by :meth:`from_pretrained`, which attaches the
    ``model``, ``cache``, ``tokenizer`` and ``generator`` attributes that the
    other methods read. ``__init__`` itself sets nothing up.
    """
    def __init__(self):
        pass
    @classmethod
    def from_pretrained(cls, path_to_model):
        """Load the model stored under ``shared.args.model_dir / path_to_model``.

        Reads ``shared.args.max_seq_len`` for the context length and, when
        set, ``shared.args.gpu_split`` (a comma-separated list of numbers)
        which is passed to ``ExLlamaV2.load``.

        Returns:
            A ``(wrapper, tokenizer)`` tuple, where ``wrapper`` is a new
            instance of this class and ``tokenizer`` is the same
            ``ExLlamaV2Tokenizer`` stored on ``wrapper.tokenizer``.
        """
        path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
        config = ExLlamaV2Config()
        config.model_dir = path_to_model
        config.prepare()
        # Applied after prepare() so the user-supplied length overrides
        # whatever prepare() stored in the config.
        config.max_seq_len = shared.args.max_seq_len
        model = ExLlamaV2(config)
        # ``None`` is handed to load() when no split was requested.
        split = None
        if shared.args.gpu_split:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
        model.load(split)
        tokenizer = ExLlamaV2Tokenizer(config)
        cache = ExLlamaV2Cache(model)
        generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
        result = cls()
        result.model = model
        result.cache = cache
        result.tokenizer = tokenizer
        result.generator = generator
        return result, tokenizer
    def generate_with_streaming(self, prompt, state):
        """Yield the reply decoded so far after each newly sampled token.

        Args:
            prompt: Text to continue; it is encoded with ``self.tokenizer``
                and truncated on the left to ``get_max_prompt_length(state)``
                tokens.
            state: Generation settings. Keys read here: ``temperature``,
                ``top_k``, ``top_p``, ``repetition_penalty``,
                ``repetition_penalty_range``, ``ban_eos_token``,
                ``auto_max_new_tokens``, ``truncation_length`` and
                ``max_new_tokens``.

        Yields:
            The cumulative decoded text of all tokens generated so far (not
            just the newest piece). Generation stops after the token budget
            is used, after the EOS token is sampled, or once
            ``shared.stop_everything`` is set.
        """
        settings = ExLlamaV2Sampler.Settings()
        settings.temperature = state['temperature']
        settings.top_k = state['top_k']
        settings.top_p = state['top_p']
        settings.token_repetition_penalty = state['repetition_penalty']
        # A non-positive range from the UI is forwarded to the sampler as -1.
        settings.token_repetition_range = -1 if state['repetition_penalty_range'] <= 0 else state['repetition_penalty_range']
        if state['ban_eos_token']:
            settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
        ids = self.tokenizer.encode(prompt)
        ids = ids[:, -get_max_prompt_length(state):]
        initial_len = ids.shape[-1]
        if state['auto_max_new_tokens']:
            # Use whatever room is left in the context window.
            max_new_tokens = state['truncation_length'] - ids.shape[-1]
        else:
            max_new_tokens = state['max_new_tokens']
        # _gen_begin_base
        # Restart the cache and feed it every prompt token except the last;
        # the last one is fed inside the loop to obtain the first logits.
        self.cache.current_seq_len = 0
        # A one-token prompt has no prefix, so skip the prefill instead of
        # calling forward() on a zero-length tensor.
        if ids.shape[-1] > 1:
            self.model.forward(ids[:, :-1], self.cache, input_mask=None, preprocess_only=True)
        has_leading_space = False
        for i in range(max_new_tokens):
            logits = self.model.forward(ids[:, -1:], self.cache, input_mask=None).float().cpu()
            token, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random())
            ids = torch.cat([ids, token], dim=1)
            # If the first generated piece starts with '▁', a space is put
            # back in front of the decoded text below.
            # NOTE(review): presumably decode() drops that leading
            # word-boundary marker — confirm against the tokenizer.
            if i == 0 and self.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
                has_leading_space = True
            decoded_text = self.tokenizer.decode(ids[:, initial_len:])[0]
            if has_leading_space:
                decoded_text = ' ' + decoded_text
            yield decoded_text
            if token.item() == self.tokenizer.eos_token_id or shared.stop_everything:
                break
    def generate(self, prompt, state):
        """Run :meth:`generate_with_streaming` to completion.

        Returns:
            The last text yielded by the stream, or ``''`` when the stream
            yields nothing.
        """
        output = ''
        for output in self.generate_with_streaming(prompt, state):
            pass
        return output
    def encode(self, string, **kwargs):
        """Return ``self.tokenizer.encode(string)``; extra keyword arguments are ignored."""
        return self.tokenizer.encode(string)
    def decode(self, string, **kwargs):
        """Return the first element of ``self.tokenizer.decode(string)``.

        Despite its name, ``string`` is whatever the tokenizer's ``decode``
        accepts; extra keyword arguments are ignored.
        """
        return self.tokenizer.decode(string)[0]
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@ -0,0 +1,119 @@
 import os
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
 import torch
 from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
 from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from modules import shared
 from modules.logging_colors import logger
 class Exllamav2HF(PreTrainedModel):
    """ExLlamaV2 model exposed through the transformers ``PreTrainedModel`` interface.

    The forward pass is delegated to an ``ExLlamaV2`` instance. The token
    sequence seen by the previous call is remembered (``past_seq``) so that a
    call whose prefix matches it can reuse the cache instead of re-feeding
    the whole prompt. When ``shared.args.cfg_cache`` is set, a second cache
    and remembered sequence are kept for the positional-argument call path.
    """
    def __init__(self, config: ExLlamaV2Config):
        """Load the ExLlamaV2 weights described by ``config`` and create the cache(s)."""
        # The transformers base class only receives a default config; the
        # real settings live in ``self.ex_config``.
        super().__init__(PretrainedConfig())
        self.ex_config = config
        self.ex_model = ExLlamaV2(config)
        # ``None`` is handed to load() when no split was requested.
        split = None
        if shared.args.gpu_split:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
        self.ex_model.load(split)
        self.generation_config = GenerationConfig()
        self.ex_cache = ExLlamaV2Cache(self.ex_model)
        self.past_seq = None
        # The negative-prompt cache only exists when cfg_cache is enabled;
        # __call__ checks the same flag before touching it.
        if shared.args.cfg_cache:
            self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
            self.past_seq_negative = None
    def _validate_model_class(self):
        """Override the transformers hook with a no-op."""
        pass
    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        """Override the transformers hook with a no-op."""
        pass
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Return ``input_ids`` and all keyword arguments merged into one dict."""
        return {'input_ids': input_ids, **kwargs}
    @property
    def device(self) -> torch.device:
        """Always report ``torch.device(0)``."""
        return torch.device(0)
    def __call__(self, *args, **kwargs):
        """Run one forward step and return a ``CausalLMOutputWithPast``.

        Two call styles are distinguished:

        * keyword ``input_ids=...`` — the regular path, using ``ex_cache``;
        * positional ``input_ids`` — treated as the negative (CFG) prompt and
          served from ``ex_cache_negative``. This requires
          ``shared.args.cfg_cache``; otherwise an error is logged and
          ``None`` is returned.

        Keyword arguments read: ``use_cache`` (default ``True``), ``labels``
        and ``past_key_values``. Only the first row of ``input_ids`` is used.
        When ``labels`` is given, the whole sequence is fed from an empty
        cache and a shifted cross-entropy loss is computed. In the returned
        object ``past_key_values`` holds the token list that was processed
        (or ``None`` when ``use_cache`` is false), not real key/value tensors.
        """
        use_cache = kwargs.get('use_cache', True)
        labels = kwargs.get('labels', None)
        past_key_values = kwargs.get('past_key_values', None)
        if len(args) > 0:
            if not shared.args.cfg_cache:
                logger.error("Please enable the cfg-cache option to use CFG with ExLlamav2_HF.")
                return
            input_ids = args[0]
            is_negative = True
            past_seq = self.past_seq_negative
            ex_cache = self.ex_cache_negative
        else:
            input_ids = kwargs['input_ids']
            is_negative = False
            past_seq = self.past_seq
            ex_cache = self.ex_cache
        seq = input_ids[0].tolist()
        # On the negative path ``past_key_values`` is the token list returned
        # by an earlier call, so the full sequence is rebuilt by concatenation.
        if is_negative and past_key_values is not None:
            seq = past_key_values + seq
        seq_tensor = torch.tensor(seq)
        # Make the forward call
        if labels is None:
            # Rebuild the cache unless everything before the newest token is
            # exactly what the previous call processed.
            if past_seq is None or not torch.equal(past_seq, seq_tensor[:-1]):
                ex_cache.current_seq_len = 0
                # A one-token sequence has no prefix, so skip the prefill
                # instead of calling forward() on a zero-length tensor.
                if len(seq) > 1:
                    self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), ex_cache, preprocess_only=True)
            logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), ex_cache).to(input_ids.device)
        else:
            ex_cache.current_seq_len = 0
            # NOTE(review): the author left the variant below commented out;
            # the loss further down needs logits for every position, so
            # confirm the default of ``last_id_only`` provides them.
            # logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache, last_id_only=False)
            logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache)
        if is_negative:
            self.past_seq_negative = seq_tensor
        else:
            self.past_seq = seq_tensor
        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, logits.shape[-1])
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)
        return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        """Build the model stored under ``shared.args.model_dir / pretrained_model_name_or_path``.

        No extra positional or keyword arguments are accepted (enforced with
        an ``assert``). The context length is taken from
        ``shared.args.max_seq_len``.
        """
        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
        if isinstance(pretrained_model_name_or_path, str):
            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
        pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
        config = ExLlamaV2Config()
        config.model_dir = pretrained_model_name_or_path
        config.prepare()
        # Applied after prepare() so the user-supplied length overrides
        # whatever prepare() stored in the config.
        config.max_seq_len = shared.args.max_seq_len
        return cls(config)
--- a/modules/loaders.py
+++ b/modules/loaders.py
@ -42,6 +42,15 @@ loaders_and_params = OrderedDict({
        'compress_pos_emb',
        'exllama_info',
    ],
    'ExLlamav2': [
        'gpu_split',
        'max_seq_len',
    ],
    'ExLlamav2_HF': [
        'gpu_split',
        'max_seq_len',
        'cfg_cache',
    ],
    'AutoGPTQ': [
        'triton',
        'no_inject_fused_attention',
@ -180,6 +189,42 @@ loaders_samplers = {
        'ban_eos_token',
        'auto_max_new_tokens',
    },
    'ExLlamav2': {
        'temperature',
        'top_p',
        'top_k',
        'repetition_penalty',
        'repetition_penalty_range',
        'seed',
        'ban_eos_token',
        'auto_max_new_tokens',
    },
    'ExLlamav2_HF': {
        'temperature',
        'top_p',
        'top_k',
        'typical_p',
        'epsilon_cutoff',
        'eta_cutoff',
        'tfs',
        'top_a',
        'repetition_penalty',
        'repetition_penalty_range',
        'encoder_repetition_penalty',
        'no_repeat_ngram_size',
        'min_length',
        'seed',
        'do_sample',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'guidance_scale',
        'negative_prompt',
        'ban_eos_token',
        'add_bos_token',
        'skip_special_tokens',
        'auto_max_new_tokens',
    },
    'AutoGPTQ': {
        'temperature',
        'top_p',
--- a/modules/models.py
+++ b/modules/models.py
@ -59,6 +59,8 @@ def load_model(model_name, loader=None):
        'RWKV': RWKV_loader,
        'ExLlama': ExLlama_loader,
        'ExLlama_HF': ExLlama_HF_loader,
        'ExLlamav2': ExLlamav2_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
        'ctransformers': ctransformers_loader,
    }
@ -329,6 +331,19 @@ def ExLlama_HF_loader(model_name):
    return ExllamaHF.from_pretrained(model_name)
 def ExLlamav2_loader(model_name):
    """Load ``model_name`` with the ExLlamav2 backend and return ``(model, tokenizer)``."""
    # Imported lazily, at call time, as the original did.
    from modules.exllamav2 import Exllamav2Model as _Backend
    loaded_model, loaded_tokenizer = _Backend.from_pretrained(model_name)
    return loaded_model, loaded_tokenizer
 def ExLlamav2_HF_loader(model_name):
    """Load ``model_name`` with the ExLlamav2_HF backend and return the resulting model."""
    # Imported lazily, at call time, as the original did.
    from modules.exllamav2_hf import Exllamav2HF as _Backend
    hf_model = _Backend.from_pretrained(model_name)
    return hf_model
 def get_max_memory_dict():
    max_memory = {}
    if shared.args.gpu_memory:
--- a/modules/shared.py
+++ b/modules/shared.py
@ -219,6 +219,10 @@ def fix_loader_name(name):
        return 'ExLlama'
    elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']:
        return 'ExLlama_HF'
    elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2']:
        return 'ExLlamav2'
    elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf']:
        return 'ExLlamav2_HF'
    elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
        return 'ctransformers'
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@ -42,7 +42,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
            yield ''
            return
-        if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel']:
+        if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'Exllamav2Model', 'CtransformersModel']:
            generate_func = generate_reply_custom
        else:
            generate_func = generate_reply_HF
@ -106,9 +106,10 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel', 'Exllamav2Model']:
        input_ids = shared.tokenizer.encode(str(prompt))
-        input_ids = np.array(input_ids).reshape(1, len(input_ids))
+        if shared.model.__class__.__name__ not in ['Exllamav2Model']:
            input_ids = np.array(input_ids).reshape(1, len(input_ids))
    else:
        input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
@ -120,7 +121,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if truncation_length is not None:
        input_ids = input_ids[:, -truncation_length:]
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
        return input_ids
    elif shared.args.deepspeed:
        return input_ids.to(device=local_rank)
--- a/requirements.txt
+++ b/requirements.txt
@ -8,7 +8,9 @@ accelerate==0.22.*
 colorama
 datasets
 einops
 exllamav2==0.0.0
 markdown
 ninja
 numpy==1.24
 optimum==1.12.0
 pandas
--- a/requirements_nocuda.txt
+++ b/requirements_nocuda.txt
@ -8,7 +8,9 @@ accelerate==0.22.*
 colorama
 datasets
 einops
 exllamav2==0.0.0
 markdown
 ninja
 numpy==1.24
 optimum==1.12.0
 pandas