text-generation-webui/modules/exllamav2_hf.py

import os
import traceback
from pathlib import Path
from typing import Any, Dict, Optional, Union

import torch
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
from torch.nn import CrossEntropyLoss
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast

from modules import shared
from modules.logging_colors import logger

# Probe for flash-attention once, when this module is imported. Neither
# failure mode is treated as fatal: each one only emits a warning so the rest
# of the module keeps loading.
try:
    import flash_attn
except ModuleNotFoundError:
    # flash_attn (or a module it imports) could not be found.
    logger.warning(
        'You are running ExLlamaV2 without flash-attention. '
        'This will cause the VRAM usage to be a lot higher than it could be.\n'
        'Try installing flash-attention following the instructions here: '
        'https://github.com/Dao-AILab/flash-attention#installation-and-features'
    )
except Exception:
    # The import itself blew up for some other reason; print the traceback so
    # the underlying error is visible to the user.
    logger.warning('Failed to load flash-attention due to the following error:\n')
    traceback.print_exc()


class Exllamav2HF(PreTrainedModel):
    """Expose an ExLlamaV2 model through the `PreTrainedModel` interface.

    The wrapper owns the ExLlamaV2 model, one key/value cache for regular
    prompts and, when `shared.args.cfg_cache` is set, a second cache that is
    used for positional-argument calls (the "negative" path in `__call__`).
    For each cache it remembers the token sequence of the previous call so the
    longest common prefix of the next call can be reused instead of being
    recomputed.
    """

    def __init__(self, config: ExLlamaV2Config):
        """Load the ExLlamaV2 model described by `config` and create its caches.

        Args:
            config: ExLlamaV2 configuration; kept on the instance as
                `ex_config`.
        """
        # The parent class only receives a blank PretrainedConfig; the real
        # settings live in the ExLlamaV2 config stored below.
        super().__init__(PretrainedConfig())
        self.ex_config = config
        self.ex_model = ExLlamaV2(config)

        # `gpu_split` is a comma-separated string; it is parsed into a list of
        # floats and handed to ExLlamaV2.load(). None is passed when unset.
        split = None
        if shared.args.gpu_split:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

        self.ex_model.load(split)
        self.generation_config = GenerationConfig()
        self.loras = None

        # Cache and remembered token sequence for keyword-argument calls.
        self.ex_cache = ExLlamaV2Cache(self.ex_model)
        self.past_seq = None

        # Second cache, only allocated on request, for positional-argument
        # (negative prompt) calls.
        if shared.args.cfg_cache:
            self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
            self.past_seq_negative = None

    def _validate_model_class(self):
        """No-op override (presumably silences the parent's generation checks)."""
        pass

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        """No-op override: every keyword in `model_kwargs` is accepted as-is."""
        pass

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Return the forward-call inputs: `input_ids` merged with `kwargs`."""
        return {'input_ids': input_ids, **kwargs}

    @property
    def device(self) -> torch.device:
        """Always report `torch.device(0)`, regardless of any GPU split."""
        return torch.device(0)

    @staticmethod
    def _shared_prefix_length(previous: torch.Tensor, current: torch.Tensor) -> int:
        """Return the number of leading token ids `previous` and `current` share."""
        overlap = min(previous.shape[0], current.shape[0])
        mismatches = torch.nonzero(~torch.eq(previous[:overlap], current[:overlap]))
        if len(mismatches) > 0:
            # Position of the first differing token == length of the common prefix.
            return mismatches[0].item()

        return overlap

    def __call__(self, *args, **kwargs):
        """Run a forward pass and return logits (plus a loss if labels are given).

        A call with positional arguments is treated as the negative-prompt
        pass: `args[0]` holds the input ids and the negative cache is used. A
        call with keyword arguments only reads `kwargs['input_ids']` and uses
        the regular cache.

        Keyword Args:
            input_ids: token ids; only row 0 is used.
            labels: optional targets. When present, the whole sequence is
                evaluated from an empty cache and a cross-entropy loss is
                computed.
            past_key_values: for negative calls, a list of token ids that is
                prepended to the input ids.
            use_cache: when true (default), the token list is returned in the
                `past_key_values` field of the output.

        Returns:
            A `CausalLMOutputWithPast`, or None when a positional (CFG) call
            is made while `shared.args.cfg_cache` is disabled.
        """
        use_cache = kwargs.get('use_cache', True)
        labels = kwargs.get('labels', None)
        past_key_values = kwargs.get('past_key_values', None)

        if len(args) > 0:
            if not shared.args.cfg_cache:
                logger.error("Please enable the cfg-cache option to use CFG with ExLlamav2_HF.")
                return

            input_ids = args[0]
            is_negative = True
            past_seq = self.past_seq_negative
            ex_cache = self.ex_cache_negative
        else:
            input_ids = kwargs['input_ids']
            is_negative = False
            past_seq = self.past_seq
            ex_cache = self.ex_cache

        seq = input_ids[0].tolist()
        if is_negative and past_key_values is not None:
            seq = past_key_values + seq

        seq_tensor = torch.tensor(seq)

        # Forget the remembered sequence while the cache is being rewritten.
        # If a forward() call below raises, the cache may be partially
        # overwritten; leaving the old sequence in place would let the next
        # call reuse a prefix the cache no longer holds. It is stored again
        # only after every forward call has succeeded.
        if is_negative:
            self.past_seq_negative = None
        else:
            self.past_seq = None

        # Make the forward call
        if labels is None:
            reusable = 0
            if past_seq is not None:
                reusable = self._shared_prefix_length(past_seq, seq_tensor)

            if reusable > 0:
                # Keep the matching prefix in the cache and only process what follows it.
                ex_cache.current_seq_len = reusable
                if len(seq_tensor) - reusable > 1:
                    self.ex_model.forward(seq_tensor[reusable:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)
                elif len(seq_tensor) == reusable:
                    # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
                    # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
                    ex_cache.current_seq_len -= 1
            else:
                # Nothing reusable: start from an empty cache and process everything but the last token.
                ex_cache.current_seq_len = 0
                if len(seq_tensor) > 1:
                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)

            # Only the final token is fed here, producing the logits that are returned.
            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, loras=self.loras).to(input_ids.device).float()
        else:
            # With labels, logits are needed at every position, so the cache is not reused.
            ex_cache.current_seq_len = 0
            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, loras=self.loras).float()

        # The cache now matches `seq_tensor`; remember it for the next call.
        if is_negative:
            self.past_seq_negative = seq_tensor
        else:
            self.past_seq = seq_tensor

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, logits.shape[-1])
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        # `past_key_values` carries the plain token list, not key/value tensors.
        return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        """Build an instance from a model folder located under `shared.args.model_dir`.

        Args:
            pretrained_model_name_or_path: folder name or path, joined onto
                `shared.args.model_dir`.
            *model_args, **kwargs: not supported; must be empty.

        Returns:
            A new instance of `cls`.

        Raises:
            AssertionError: if any extra positional or keyword argument is passed.
        """
        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"

        # Path() accepts both str and PathLike, so no separate str conversion is needed.
        model_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)

        config = ExLlamaV2Config()
        config.model_dir = str(model_path)
        config.prepare()

        # Applied after prepare() so the command-line values are the ones left on the config.
        config.max_seq_len = shared.args.max_seq_len
        config.scale_pos_emb = shared.args.compress_pos_emb
        config.scale_alpha_value = shared.args.alpha_value

        # Instantiate through `cls` so subclasses get an instance of their own type.
        return cls(config)
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00			`import os`
Add missing exception 2023-10-21 08:53:24 +02:00			`import traceback`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00			`from pathlib import Path`
			`from typing import Any, Dict, Optional, Union`

			`import torch`
			`from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config`
			`from torch.nn import CrossEntropyLoss`
			`from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel`
			`from transformers.modeling_outputs import CausalLMOutputWithPast`

			`from modules import shared`
			`from modules.logging_colors import logger`

Add a warning about ExLlamaV2 without flash-attn 2023-09-18 21:25:17 +02:00			`try:`
			`import flash_attn`
			`except ModuleNotFoundError:`
			`logger.warning(`
			`'You are running ExLlamaV2 without flash-attention. This will cause the VRAM usage '`
			`'to be a lot higher than it could be.\n'`
			`'Try installing flash-attention following the instructions here: '`
			`'https://github.com/Dao-AILab/flash-attention#installation-and-features'`
			`)`
			`pass`
Add missing exception 2023-10-21 08:53:24 +02:00			`except Exception:`
			`logger.warning('Failed to load flash-attention due to the following error:\n')`
			`traceback.print_exc()`
Add a warning about ExLlamaV2 without flash-attn 2023-09-18 21:25:17 +02:00
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00
			`class Exllamav2HF(PreTrainedModel):`
			`def __init__(self, config: ExLlamaV2Config):`
			`super().__init__(PretrainedConfig())`
			`self.ex_config = config`
			`self.ex_model = ExLlamaV2(config)`
			`split = None`
			`if shared.args.gpu_split:`
			`split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]`

			`self.ex_model.load(split)`
			`self.generation_config = GenerationConfig()`
Exllamav2 lora support (#4229) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-10-14 21:12:41 +02:00			`self.loras = None`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00
			`self.ex_cache = ExLlamaV2Cache(self.ex_model)`
			`self.past_seq = None`

			`if shared.args.cfg_cache:`
			`self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)`
			`self.past_seq_negative = None`

			`def _validate_model_class(self):`
			`pass`

			`def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):`
			`pass`

			`def prepare_inputs_for_generation(self, input_ids, **kwargs):`
			`return {'input_ids': input_ids, **kwargs}`

			`@property`
			`def device(self) -> torch.device:`
			`return torch.device(0)`

			`def __call__(self, *args, **kwargs):`
			`use_cache = kwargs.get('use_cache', True)`
			`labels = kwargs.get('labels', None)`
			`past_key_values = kwargs.get('past_key_values', None)`

			`if len(args) > 0:`
			`if not shared.args.cfg_cache:`
			`logger.error("Please enable the cfg-cache option to use CFG with ExLlamav2_HF.")`
			`return`

			`input_ids = args[0]`
			`is_negative = True`
			`past_seq = self.past_seq_negative`
			`ex_cache = self.ex_cache_negative`
			`else:`
			`input_ids = kwargs['input_ids']`
			`is_negative = False`
			`past_seq = self.past_seq`
			`ex_cache = self.ex_cache`

			`seq = input_ids[0].tolist()`
			`if is_negative and past_key_values is not None:`
			`seq = past_key_values + seq`

			`seq_tensor = torch.tensor(seq)`
ExLlama_HF (v1 and v2) prefix matching 2023-09-19 22:12:19 +02:00			`reset = True`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00
			`# Make the forward call`
			`if labels is None:`
ExLlama_HF (v1 and v2) prefix matching 2023-09-19 22:12:19 +02:00			`if past_seq is not None:`
			`min_length = min(past_seq.shape[0], seq_tensor.shape[0])`
			`indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))`
			`if len(indices) > 0:`
			`longest_prefix = indices[0].item()`
			`else:`
			`longest_prefix = min_length`

			`if longest_prefix > 0:`
			`reset = False`
			`ex_cache.current_seq_len = longest_prefix`
			`if len(seq_tensor) - longest_prefix > 1:`
Exllamav2 lora support (#4229) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-10-14 21:12:41 +02:00			`self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)`
Fix off-by-one error in exllama_hf caching logic (#4145) 2023-10-05 17:20:56 +02:00			`elif len(seq_tensor) == longest_prefix:`
			`# Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,`
			`# because we feed input_ids[-1] to forward() below, but that last token is already in the cache!`
			`ex_cache.current_seq_len -= 1`
ExLlama_HF (v1 and v2) prefix matching 2023-09-19 22:12:19 +02:00
			`if reset:`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00			`ex_cache.current_seq_len = 0`
ExLlama_HF (v1 and v2) prefix matching 2023-09-19 22:12:19 +02:00			`if len(seq_tensor) > 1:`
Exllamav2 lora support (#4229) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-10-14 21:12:41 +02:00			`self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00
ExLlamav2_HF: Convert logits to FP32 (#4310) 2023-10-19 04:16:05 +02:00			`logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, loras=self.loras).to(input_ids.device).float()`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00			`else:`
			`ex_cache.current_seq_len = 0`
ExLlamav2_HF: Convert logits to FP32 (#4310) 2023-10-19 04:16:05 +02:00			`logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, loras=self.loras).float()`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00
			`if is_negative:`
			`self.past_seq_negative = seq_tensor`
			`else:`
			`self.past_seq = seq_tensor`

			`loss = None`
			`if labels is not None:`
			`# Shift so that tokens < n predict n`
			`shift_logits = logits[..., :-1, :].contiguous()`
			`shift_labels = labels[..., 1:].contiguous()`
			`# Flatten the tokens`
			`loss_fct = CrossEntropyLoss()`
			`shift_logits = shift_logits.view(-1, logits.shape[-1])`
			`shift_labels = shift_labels.view(-1)`
			`# Enable model parallelism`
			`shift_labels = shift_labels.to(shift_logits.device)`
			`loss = loss_fct(shift_logits, shift_labels)`

			`return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)`

			`@classmethod`
			`def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):`
			`assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"`
			`if isinstance(pretrained_model_name_or_path, str):`
			`pretrained_model_name_or_path = Path(pretrained_model_name_or_path)`

			`pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)`

			`config = ExLlamaV2Config()`
Fix ExLlama-v2 path issue 2023-09-13 02:42:22 +02:00			`config.model_dir = str(pretrained_model_name_or_path)`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00			`config.prepare()`
Add alpha_value/compress_pos_emb to ExLlama-v2 2023-09-13 00:02:47 +02:00
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00			`config.max_seq_len = shared.args.max_seq_len`
Fix NTK (alpha) and RoPE scaling for exllamav2 and exllamav2_HF (#3897) 2023-09-13 07:35:09 +02:00			`config.scale_pos_emb = shared.args.compress_pos_emb`
			`config.scale_alpha_value = shared.args.alpha_value`
Add ExLlamaV2 and ExLlamav2_HF loaders (#3881) 2023-09-12 19:33:07 +02:00
			`return Exllamav2HF(config)`