diff --git a/modules/LLaMA.py b/modules/LLaMA.py
deleted file mode 100644
index 3781ccf5..00000000
--- a/modules/LLaMA.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the GNU General Public License version 3.
-
-import json
-import os
-import sys
-import time
-from pathlib import Path
-from typing import Tuple
-
-import fire
-import torch
-from fairscale.nn.model_parallel.initialize import initialize_model_parallel
-from llama import LLaMA, ModelArgs, Tokenizer, Transformer
-
-os.environ['RANK'] = '0'
-os.environ['WORLD_SIZE'] = '1'
-os.environ['MP'] = '1'
-os.environ['MASTER_ADDR'] = '127.0.0.1'
-os.environ['MASTER_PORT'] = '2223'
-
-def setup_model_parallel() -> Tuple[int, int]:
-    local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    world_size = int(os.environ.get("WORLD_SIZE", -1))
-
-    torch.distributed.init_process_group("gloo")
-    initialize_model_parallel(world_size)
-    torch.cuda.set_device(local_rank)
-
-    # seed must be the same in all processes
-    torch.manual_seed(1)
-    return local_rank, world_size
-
-def load(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    local_rank: int,
-    world_size: int,
-    max_seq_len: int,
-    max_batch_size: int,
-) -> LLaMA:
-    start_time = time.time()
-    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-    assert world_size == len(
-        checkpoints
-    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
-    ckpt_path = checkpoints[local_rank]
-    print("Loading")
-    checkpoint = torch.load(ckpt_path, map_location="cpu")
-    with open(Path(ckpt_dir) / "params.json", "r") as f:
-        params = json.loads(f.read())
-
-    model_args: ModelArgs = ModelArgs(
-        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
-    )
-    tokenizer = Tokenizer(model_path=tokenizer_path)
-    model_args.vocab_size = tokenizer.n_words
-    torch.set_default_tensor_type(torch.cuda.HalfTensor)
-    model = Transformer(model_args)
-    torch.set_default_tensor_type(torch.FloatTensor)
-    model.load_state_dict(checkpoint, strict=False)
-
-    generator = LLaMA(model, tokenizer)
-    print(f"Loaded in {time.time() - start_time:.2f} seconds")
-    return generator
-
-
-class LLaMAModel:
-    def __init__(self):
-        pass
-
-    @classmethod
-    def from_pretrained(self, path, max_seq_len=2048, max_batch_size=1):
-        tokenizer_path = path / "tokenizer.model"
-        path = os.path.abspath(path)
-        tokenizer_path = os.path.abspath(tokenizer_path)
-
-        local_rank, world_size = setup_model_parallel()
-        if local_rank > 0:
-            sys.stdout = open(os.devnull, "w")
-
-        generator = load(
-            path, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
-        )
-
-        result = self()
-        result.pipeline = generator
-        return result
-
-    def generate(self, prompt, token_count=512, temperature=0.8, top_p=0.95):
-
-        results = self.pipeline.generate(
-            [prompt], max_gen_len=token_count, temperature=temperature, top_p=top_p
-        )
-
-        return results[0]
diff --git a/modules/LLaMA_8bit.py b/modules/LLaMA_8bit.py
deleted file mode 100644
index a339277c..00000000
--- a/modules/LLaMA_8bit.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the GNU General Public License version 3.
-
-from typing import Tuple
-import os
-import sys
-import torch
-import fire
-import time
-import json
-
-from pathlib import Path
-
-from fairscale.nn.model_parallel.initialize import initialize_model_parallel
-
-from repositories.llama_int8.llama import ModelArgs, Transformer, Tokenizer, LLaMA
-
-
-def setup_model_parallel() -> Tuple[int, int]:
-    local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    world_size = int(os.environ.get("WORLD_SIZE", -1))
-
-    torch.distributed.init_process_group("nccl")
-    initialize_model_parallel(world_size)
-    torch.cuda.set_device(local_rank)
-
-    # seed must be the same in all processes
-    torch.manual_seed(1)
-    return local_rank, world_size
-
-
-def load(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    max_seq_len: int,
-    max_batch_size: int,
-) -> LLaMA:
-    start_time = time.time()
-    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-
-    with open(Path(ckpt_dir) / "params.json", "r") as f:
-        params = json.loads(f.read())
-
-    model_args: ModelArgs = ModelArgs(
-        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
-    )
-    tokenizer = Tokenizer(model_path=tokenizer_path)
-    model_args.vocab_size = tokenizer.n_words
-    # torch.set_default_tensor_type(torch.cuda.HalfTensor)
-    torch.set_default_tensor_type(torch.HalfTensor)
-    print("Creating transformer")
-    model = Transformer(model_args)
-    print("Transformer created")
-
-    key_to_dim = {
-        "w1": 0,
-        "w2": -1,
-        "w3": 0,
-        "wo": -1,
-        "wq": 0,
-        "wk": 0,
-        "wv": 0,
-        "output": 0,
-        "tok_embeddings": -1,
-        "ffn_norm": None,
-        "attention_norm": None,
-        "norm": None,
-        "rope": None,
-    }
-
-    # ?
-    torch.set_default_tensor_type(torch.FloatTensor)
-
-    # load the state dict incrementally, to avoid memory problems
-    for i, ckpt in enumerate(checkpoints):
-        print(f"Loading checkpoint {i}")
-        checkpoint = torch.load(ckpt, map_location="cpu")
-        for parameter_name, parameter in model.named_parameters():
-            short_name = parameter_name.split(".")[-2]
-            if key_to_dim[short_name] is None and i == 0:
-                parameter.data = checkpoint[parameter_name]
-            elif key_to_dim[short_name] == 0:
-                size = checkpoint[parameter_name].size(0)
-                parameter.data[size * i : size * (i + 1), :] = checkpoint[
-                    parameter_name
-                ]
-            elif key_to_dim[short_name] == -1:
-                size = checkpoint[parameter_name].size(-1)
-                parameter.data[:, size * i : size * (i + 1)] = checkpoint[
-                    parameter_name
-                ]
-        del checkpoint
-
-    # model.load_state_dict(checkpoint, strict=False)
-    model.quantize()
-
-    generator = LLaMA(model, tokenizer)
-    print(f"Loaded in {time.time() - start_time:.2f} seconds")
-    return generator
-
-
-class LLaMAModel_8bit:
-    def __init__(self):
-        pass
-
-    @classmethod
-    def from_pretrained(self, path, max_seq_len=2048, max_batch_size=1):
-        tokenizer_path = path / "tokenizer.model"
-        path = os.path.abspath(path)
-        tokenizer_path = os.path.abspath(tokenizer_path)
-
-        generator = load(path, tokenizer_path, max_seq_len, max_batch_size)
-
-        result = self()
-        result.pipeline = generator
-        return result
-
-    def generate(self, prompt, token_count=512, temperature=0.8, top_p=0.95):
-
-        results = self.pipeline.generate(
-            [prompt], max_gen_len=token_count, temperature=temperature, top_p=top_p
-        )
-
-        return results[0]
-
diff --git a/modules/models.py b/modules/models.py
index c7b75bb9..40feb8b3 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -39,10 +39,9 @@ def load_model(model_name):
     t0 = time.time()
 
     shared.is_RWKV = model_name.lower().startswith('rwkv-')
-    shared.is_LLaMA = model_name.lower().startswith('llama-')
 
     # Default settings
-    if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV or shared.is_LLaMA):
+    if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
             model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
@@ -86,23 +85,6 @@ def load_model(model_name):
 
         return model, None
 
-    # LLaMA model (not on HuggingFace)
-    elif shared.is_LLaMA:
-        if shared.args.load_in_8bit:
-            import modules.LLaMA_8bit
-            from modules.LLaMA_8bit import LLaMAModel_8bit
-
-            model = LLaMAModel_8bit.from_pretrained(Path(f'models/{model_name}'))
-
-            return model, None
-        else:
-            import modules.LLaMA
-            from modules.LLaMA import LLaMAModel
-
-            model = LLaMAModel.from_pretrained(Path(f'models/{model_name}'))
-
-            return model, None
-
     # Custom
     else:
         command = "AutoModelForCausalLM.from_pretrained"
diff --git a/modules/shared.py b/modules/shared.py
index e9dfdaa2..29276fd3 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -6,7 +6,6 @@ model_name = ""
 soft_prompt_tensor = None
 soft_prompt = False
 is_RWKV = False
-is_LLaMA = False
 
 # Chat variables
 history = {'internal': [], 'visible': []}
@@ -43,7 +42,6 @@ settings = {
         'default': 'NovelAI-Sphinx Moth',
         'pygmalion-*': 'Pygmalion',
         'RWKV-*': 'Naive',
-        'llama-*': 'Naive',
         '(rosey|chip|joi)_.*_instruct.*': 'Instruct Joi (Contrastive Search)'
     },
     'prompts': {
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f9082a31..ee93fb7c 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -24,7 +24,7 @@ def encode(prompt, tokens_to_generate=0, add_special_tokens=True):
 
     # These models do not have explicit tokenizers for now, so
     # we return an estimate for the number of tokens
-    if shared.is_RWKV or shared.is_LLaMA:
+    if shared.is_RWKV:
         return np.zeros((1, len(prompt)//4))
 
     input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', truncation=True, max_length=get_max_prompt_length(tokens_to_generate), add_special_tokens=add_special_tokens)
@@ -90,7 +90,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
 
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
-    if shared.is_RWKV or shared.is_LLaMA:
+    if shared.is_RWKV:
         if shared.args.no_stream:
             reply = shared.model.generate(question, token_count=max_new_tokens, temperature=temperature, top_p=top_p)
             t1 = time.time()
diff --git a/requirements.txt b/requirements.txt
index 55aeb8fd..70dc8349 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,4 @@ gradio==3.18.0
 numpy
 rwkv==0.0.6
 safetensors==0.2.8
-git+https://github.com/huggingface/transformers
+git+https://github.com/oobabooga/transformers@llama_push
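The deleted `load()` in `modules/LLaMA_8bit.py` reassembles model-parallel checkpoint shards directly into the model's parameters: for each weight it looks up a concatenation dimension in `key_to_dim` (0 for layers split row-wise such as `wq`/`wk`/`wv`/`w1`/`w3`/`output`, -1 for layers split along the last dimension such as `wo`/`w2`/`tok_embeddings`, `None` for replicated norms) and copies each shard into the matching slice. The snippet below is a standalone toy illustration of that slicing scheme, not code from the repository:

```python
import torch

# Two hypothetical MP=2 shards of a single weight that is split along dim 0.
shards = [torch.zeros(2, 6), torch.ones(2, 6)]
full = torch.empty(4, 6)  # the unsharded parameter being reassembled

for i, shard in enumerate(shards):
    size = shard.size(0)
    # Same indexing as the deleted loader's key_to_dim == 0 branch.
    full[size * i : size * (i + 1), :] = shard

assert torch.equal(full[:2], shards[0]) and torch.equal(full[2:], shards[1])
```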
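With the custom loaders removed, LLaMA models are expected to go through the same `AutoModelForCausalLM` branch of `load_model()` as any other Hugging Face model, using the pinned `oobabooga/transformers@llama_push` build from requirements.txt. A minimal sketch of that path, assuming the weights have already been converted to the Hugging Face format, that the branch exposes them through the standard auto classes, and that `models/llama-7b` is a hypothetical checkpoint directory:

```python
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = Path("models/llama-7b")  # hypothetical converted-checkpoint folder

# Mirrors the generic branch in modules/models.py; load_in_8bit=True could be
# passed instead of the float16 dtype for an 8-bit load.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', torch_dtype=torch.float16)

prompt = "Hello, my name is"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
output = model.generate(input_ids, max_new_tokens=128, do_sample=True, temperature=0.8, top_p=0.95)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```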