Remove RWKV loader (#5130)

This commit is contained in:
oobabooga 2023-12-31 02:01:40 -03:00 committed by GitHub
parent 0e54a09bcb
commit 2734ce3e4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 3 additions and 189 deletions

View File

@ -312,13 +312,6 @@ List of command-line flags
| `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. |
| `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. |
#### RWKV
| Flag | Description |
|---------------------------------|-------------|
| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. |
#### RoPE (for llama.cpp, ExLlamaV2, and transformers)
| Flag | Description |

View File

@ -1,154 +0,0 @@
'''
This loader is not currently maintained as RWKV can now be loaded
through the transformers library.
'''
import copy
import os
from pathlib import Path
import numpy as np
from tokenizers import Tokenizer
from transformers import is_torch_xpu_available
import modules.shared as shared
from modules.callbacks import Iteratorize
np.set_printoptions(precision=4, suppress=True, linewidth=200)
os.environ['RWKV_JIT_ON'] = '1'
os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0' # use CUDA kernel for seq mode (much faster)
from rwkv.model import RWKV
from rwkv.utils import PIPELINE, PIPELINE_ARGS
class RWKVModel:
def __init__(self):
pass
@classmethod
def from_pretrained(self, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"):
tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")
if shared.args.rwkv_strategy is None:
model = RWKV(model=str(path), strategy=f'{device} {dtype}')
else:
model = RWKV(model=str(path), strategy=shared.args.rwkv_strategy)
pipeline = PIPELINE(model, str(tokenizer_path))
result = self()
result.pipeline = pipeline
result.model = model
result.cached_context = ""
result.cached_model_state = None
result.cached_output_logits = None
return result
def generate(self, prompt, state, callback=None):
args = PIPELINE_ARGS(
temperature=state['temperature'],
top_p=state['top_p'],
top_k=state['top_k'],
alpha_frequency=0.1, # Frequency Penalty (as in GPT-3)
alpha_presence=0.1, # Presence Penalty (as in GPT-3)
token_ban=[0], # ban the generation of some tokens
token_stop=[]
)
if self.cached_context != "":
if prompt.startswith(self.cached_context):
prompt = prompt[len(self.cached_context):]
else:
self.cached_context = ""
self.cached_model_state = None
self.cached_output_logits = None
# out = self.pipeline.generate(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
out = self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
return out
def generate_with_streaming(self, *args, **kwargs):
with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
reply = ''
for token in generator:
reply += token
yield reply
# Similar to the PIPELINE.generate, but lets us maintain the cached_model_state
def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):
all_tokens = []
out_str = ''
occurrence = {}
state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None
# if we ended up with an empty context, just reuse the cached logits
# this can happen if a user undoes a message and then sends the exact message again
# in that case the full context ends up being the same as the cached_context, so the remaining context is empty.
if ctx == "":
out = self.cached_output_logits
token = None
for i in range(token_count):
# forward
tokens = self.pipeline.encode(ctx) if i == 0 else [token]
while len(tokens) > 0:
out, state = self.model.forward(tokens[:args.chunk_len], state)
tokens = tokens[args.chunk_len:]
if i == 0:
begin_token = len(all_tokens)
last_token_posi = begin_token
# cache the model state after scanning the context
# we don't cache the state after processing our own generated tokens because
# the output string might be post-processed arbitrarily. Therefore, what's fed into the model
# on the next round of chat might be slightly different what what it output on the previous round
if i == 0:
self.cached_context += ctx
self.cached_model_state = copy.deepcopy(state)
self.cached_output_logits = copy.deepcopy(out)
# adjust probabilities
for n in args.token_ban:
out[n] = -float('inf')
for n in occurrence:
out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
# sampler
token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)
if token in args.token_stop:
break
all_tokens += [token]
if token not in occurrence:
occurrence[token] = 1
else:
occurrence[token] += 1
# output
tmp = self.pipeline.decode(all_tokens[last_token_posi:])
if '\ufffd' not in tmp: # is valid utf-8 string?
if callback:
callback(tmp)
out_str += tmp
last_token_posi = begin_token + i + 1
return out_str
class RWKVTokenizer:
def __init__(self):
pass
@classmethod
def from_pretrained(self, path):
tokenizer_path = path / "20B_tokenizer.json"
tokenizer = Tokenizer.from_file(str(tokenizer_path))
result = self()
result.tokenizer = tokenizer
return result
def encode(self, prompt):
return self.tokenizer.encode(prompt).ids
def decode(self, ids):
return self.tokenizer.decode(ids)

View File

@ -65,7 +65,6 @@ def load_model(model_name, loader=None):
'GPTQ-for-LLaMa': GPTQ_loader,
'llama.cpp': llamacpp_loader,
'llamacpp_HF': llamacpp_HF_loader,
'RWKV': RWKV_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ctransformers': ctransformers_loader,
@ -405,23 +404,6 @@ def HQQ_loader(model_name):
return model
def RWKV_loader(model_name):
'''
This loader is not currently maintained as RWKV can now be loaded
through the transformers library.
'''
from modules.RWKV import RWKVModel, RWKVTokenizer
model = RWKVModel.from_pretrained(
Path(f'{shared.args.model_dir}/{model_name}'),
dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16",
device="cpu" if shared.args.cpu else "xpu" if is_xpu_available() else "cuda"
)
tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir))
return model, tokenizer
def get_max_memory_dict():
max_memory = {}
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'

View File

@ -157,8 +157,6 @@ def infer_loader(model_name, model_settings):
loader = 'llama.cpp'
elif re.match(r'.*\.gguf', model_name.lower()):
loader = 'llama.cpp'
elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
loader = 'RWKV'
elif re.match(r'.*exl2', model_name.lower()):
loader = 'ExLlamav2_HF'
elif re.match(r'.*-hqq', model_name.lower()):

View File

@ -165,11 +165,6 @@ group.add_argument('--deepspeed', action='store_true', help='Enable the use of D
group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')
# RWKV
group = parser.add_argument_group('RWKV')
group.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
group.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')
# RoPE
group = parser.add_argument_group('RoPE')
group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')

View File

@ -44,7 +44,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return
if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
@ -118,7 +118,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel', 'Exllamav2Model']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ not in ['Exllamav2Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)