Remove RWKV loader (#5130)

2024-11-22 08:07:56 +01:00 · 2023-12-31 02:01:40 -03:00 · 2023-12-31 02:01:40 -03:00 · 2734ce3e4c
commit 2734ce3e4c
parent 0e54a09bcb
6 changed files with 3 additions and 189 deletions
--- a/README.md
+++ b/README.md
@ -312,13 +312,6 @@ List of command-line flags
 | `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. |
 | `--local_rank LOCAL_RANK`             | DeepSpeed: Optional argument for distributed setups. |
 #### RWKV
 | Flag                            | Description |
 |---------------------------------|-------------|
 | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
 | `--rwkv-cuda-on`                | RWKV: Compile the CUDA kernel for better performance. |
 #### RoPE (for llama.cpp, ExLlamaV2, and transformers)
 | Flag             | Description |
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@ -1,154 +0,0 @@
 '''
 This loader is not currently maintained as RWKV can now be loaded
 through the transformers library.
 '''
 import copy
 import os
 from pathlib import Path
 import numpy as np
 from tokenizers import Tokenizer
 from transformers import is_torch_xpu_available
 import modules.shared as shared
 from modules.callbacks import Iteratorize
 # Compact numpy printing: 4 decimals, no scientific notation, wide lines.
 np.set_printoptions(precision=4, suppress=True, linewidth=200)
 # NOTE(review): these variables are set before the `rwkv` imports that follow,
 # presumably because the package reads them at import time -- confirm before
 # reordering the imports.
 os.environ['RWKV_JIT_ON'] = '1'
 os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0'  # use CUDA kernel for seq mode (much faster)
 from rwkv.model import RWKV
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 class RWKVModel:
    '''
    Text-generation wrapper around the `rwkv` package's RWKV model and PIPELINE.
    Instances are built through `from_pretrained`, which sets these attributes:
      pipeline              -- PIPELINE used for encoding, decoding and sampling
      model                 -- the underlying RWKV model
      cached_context        -- prompt text whose model state is currently cached
      cached_model_state    -- model state obtained after processing cached_context
      cached_output_logits  -- logits obtained after processing cached_context
    The cache lets a new prompt that starts with the cached context skip
    re-processing that shared prefix.
    '''
    def __init__(self):
        pass
    @classmethod
    def from_pretrained(cls, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"):
        '''
        Load the RWKV weights at `path` and return a ready-to-use instance.
        path   -- pathlib.Path of the model file; "20B_tokenizer.json" is looked up
                  in path.parent.
        dtype  -- precision part of the default loading strategy.
        device -- device part of the default loading strategy.
        When shared.args.rwkv_strategy is set it is used as the strategy string
        instead of "<device> <dtype>".
        '''
        tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")
        # An explicitly configured strategy takes precedence over device/dtype.
        if shared.args.rwkv_strategy is None:
            strategy = f'{device} {dtype}'
        else:
            strategy = shared.args.rwkv_strategy
        model = RWKV(model=str(path), strategy=strategy)
        pipeline = PIPELINE(model, str(tokenizer_path))
        result = cls()
        result.pipeline = pipeline
        result.model = model
        # The cache starts empty; it is filled by generate_from_cached_state.
        result.cached_context = ""
        result.cached_model_state = None
        result.cached_output_logits = None
        return result
    def generate(self, prompt, state, callback=None):
        '''
        Generate a completion for `prompt` and return it as a string.
        prompt   -- full prompt text.
        state    -- dict providing 'temperature', 'top_p', 'top_k' and
                    'max_new_tokens'.
        callback -- optional callable invoked with each newly decoded text piece.
        If `prompt` starts with the cached context, only the remainder is fed to
        the model; otherwise the cache is discarded and the whole prompt is
        processed.
        '''
        args = PIPELINE_ARGS(
            temperature=state['temperature'],
            top_p=state['top_p'],
            top_k=state['top_k'],
            alpha_frequency=0.1,  # Frequency Penalty (as in GPT-3)
            alpha_presence=0.1,  # Presence Penalty (as in GPT-3)
            token_ban=[0],  # ban the generation of some tokens
            token_stop=[]
        )
        if self.cached_context != "":
            if prompt.startswith(self.cached_context):
                # Only the part after the cached prefix still needs processing.
                prompt = prompt[len(self.cached_context):]
            else:
                # The prompt diverged from the cached context: drop the cache.
                self.cached_context = ""
                self.cached_model_state = None
                self.cached_output_logits = None
        return self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
    def generate_with_streaming(self, *args, **kwargs):
        '''
        Streaming variant of `generate`: yields the cumulative reply each time a
        new text piece arrives. Arguments are forwarded to `generate` through
        Iteratorize.
        '''
        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
    # Similar to the PIPELINE.generate, but lets us maintain the cached_model_state
    def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):
        '''
        Sample up to `token_count` tokens, continuing from the cached model state.
        ctx         -- text that has not been processed yet (the prompt minus the
                       cached context).
        token_count -- maximum number of tokens to sample.
        args        -- PIPELINE_ARGS with the sampling settings; a default
                       PIPELINE_ARGS() is used when None.
        callback    -- optional callable invoked with each decoded text piece.
        Returns the generated text. An empty string is returned when there is
        nothing to sample from, i.e. no context tokens and no cached logits.
        '''
        if args is None:
            # The signature allows args=None, but the body needs chunk_len, token_ban, etc.
            args = PIPELINE_ARGS()
        all_tokens = []
        out_str = ''
        occurrence = {}
        last_token_posi = 0
        state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None
        out = None
        # if we ended up with an empty context, just reuse the cached logits
        # this can happen if a user undoes a message and then sends the exact message again
        # in that case the full context ends up being the same as the cached_context, so the remaining context is empty.
        if ctx == "":
            out = self.cached_output_logits
        token = None
        for i in range(token_count):
            # forward
            tokens = self.pipeline.encode(ctx) if i == 0 else [token]
            while len(tokens) > 0:
                out, state = self.model.forward(tokens[:args.chunk_len], state)
                tokens = tokens[args.chunk_len:]
            if out is None:
                # No context tokens were processed and no cached logits exist,
                # so there is no distribution to sample from.
                break
            # cache the model state after scanning the context
            # we don't cache the state after processing our own generated tokens because
            # the output string might be post-processed arbitrarily. Therefore, what's fed into the model
            # on the next round of chat might be slightly different from what it output on the previous round
            if i == 0:
                self.cached_context += ctx
                self.cached_model_state = copy.deepcopy(state)
                # Copied before the in-place logit adjustments below.
                self.cached_output_logits = copy.deepcopy(out)
            # adjust probabilities
            for n in args.token_ban:
                out[n] = -float('inf')
            for n in occurrence:
                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
            # sampler
            token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)
            if token in args.token_stop:
                break
            all_tokens += [token]
            occurrence[token] = occurrence.get(token, 0) + 1
            # output
            tmp = self.pipeline.decode(all_tokens[last_token_posi:])
            if '\ufffd' not in tmp:  # is valid utf-8 string?
                if callback:
                    callback(tmp)
                out_str += tmp
                # Everything decoded so far has been emitted; start the next piece here.
                last_token_posi = len(all_tokens)
        return out_str
 class RWKVTokenizer:
    '''
    Minimal tokenizer wrapper exposing `encode` / `decode` on top of the
    `tokenizers.Tokenizer` stored in `self.tokenizer`.
    '''
    def __init__(self):
        pass
    @classmethod
    def from_pretrained(cls, path):
        '''
        Build an instance from the "20B_tokenizer.json" file found in `path`.
        `path` must support the `/` operator (e.g. a pathlib.Path).
        '''
        loaded = Tokenizer.from_file(str(path / "20B_tokenizer.json"))
        instance = cls()
        instance.tokenizer = loaded
        return instance
    def encode(self, prompt):
        '''Return the token ids produced by encoding `prompt`.'''
        encoding = self.tokenizer.encode(prompt)
        return encoding.ids
    def decode(self, ids):
        '''Return the text obtained by decoding the token ids `ids`.'''
        text = self.tokenizer.decode(ids)
        return text
--- a/modules/models.py
+++ b/modules/models.py
@ -65,7 +65,6 @@ def load_model(model_name, loader=None):
        'GPTQ-for-LLaMa': GPTQ_loader,
        'llama.cpp': llamacpp_loader,
        'llamacpp_HF': llamacpp_HF_loader,
        'RWKV': RWKV_loader,
        'ExLlamav2': ExLlamav2_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
        'ctransformers': ctransformers_loader,
@ -405,23 +404,6 @@ def HQQ_loader(model_name):
    return model
 def RWKV_loader(model_name):
    '''
    Load an RWKV model and its tokenizer through the custom modules.RWKV loader.
    This loader is not currently maintained as RWKV can now be loaded
    through the transformers library.
    Returns a (model, tokenizer) tuple.
    '''
    from modules.RWKV import RWKVModel, RWKVTokenizer
    model_path = Path(f'{shared.args.model_dir}/{model_name}')
    # Precision: fp32 when shared.args.cpu is set, else bf16 if requested, else fp16.
    if shared.args.cpu:
        precision = "fp32"
    elif shared.args.bf16:
        precision = "bf16"
    else:
        precision = "fp16"
    # Device: the XPU check only runs when CPU mode is off.
    if shared.args.cpu:
        target = "cpu"
    elif is_xpu_available():
        target = "xpu"
    else:
        target = "cuda"
    model = RWKVModel.from_pretrained(model_path, dtype=precision, device=target)
    # The tokenizer file is looked up in the models directory itself.
    tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir))
    return model, tokenizer
 def get_max_memory_dict():
    max_memory = {}
    max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@ -157,8 +157,6 @@ def infer_loader(model_name, model_settings):
        loader = 'llama.cpp'
    elif re.match(r'.*\.gguf', model_name.lower()):
        loader = 'llama.cpp'
    elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
        loader = 'RWKV'
    elif re.match(r'.*exl2', model_name.lower()):
        loader = 'ExLlamav2_HF'
    elif re.match(r'.*-hqq', model_name.lower()):
--- a/modules/shared.py
+++ b/modules/shared.py
@ -165,11 +165,6 @@ group.add_argument('--deepspeed', action='store_true', help='Enable the use of D
 group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
 group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')
 # RWKV
 group = parser.add_argument_group('RWKV')
 group.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
 group.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')
 # RoPE
 group = parser.add_argument_group('RoPE')
 group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@ -44,7 +44,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
            yield ''
            return
-        if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel']:
+        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
            generate_func = generate_reply_custom
        else:
            generate_func = generate_reply_HF
@ -118,7 +118,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if shared.tokenizer is None:
        raise ValueError('No tokenizer is loaded')
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel', 'Exllamav2Model']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
        input_ids = shared.tokenizer.encode(str(prompt))
        if shared.model.__class__.__name__ not in ['Exllamav2Model']:
            input_ids = np.array(input_ids).reshape(1, len(input_ids))
@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if truncation_length is not None:
        input_ids = input_ids[:, -truncation_length:]
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
        return input_ids
    elif shared.args.deepspeed:
        return input_ids.to(device=local_rank)