2023-02-23 17:28:30 +01:00
import json
import os
2023-03-09 19:50:26 +01:00
import sys
2023-02-23 17:28:30 +01:00
import time
import zipfile
from pathlib import Path
import numpy as np
import torch
2023-02-23 17:42:23 +01:00
import transformers
2023-02-23 18:41:42 +01:00
from transformers import AutoModelForCausalLM , AutoTokenizer
import modules . shared as shared
2023-02-23 17:28:30 +01:00
2023-02-23 17:42:23 +01:00
# Only let error-level messages from the transformers library through.
transformers.logging.set_verbosity_error()

# Stays None unless the DeepSpeed branch below assigns a rank.
local_rank = None

# FlexGen is an optional dependency: import it only when it was requested.
if shared.args.flexgen:
    from flexgen.flex_opt import (CompressionConfig, ExecutionEnv, OptLM,
                                  Policy, str2bool)

# DeepSpeed is an optional dependency as well; everything in this branch runs
# once, at import time, and leaves `ds_config` / `dschf` as module globals
# that load_model() reads later.
if shared.args.deepspeed:
    import deepspeed
    from transformers.deepspeed import (HfDeepSpeedConfig,
                                        is_deepspeed_zero3_enabled)

    from modules.deepspeed_parameters import generate_ds_config

    # Distributed setup
    # An explicit local_rank argument wins; otherwise fall back to the
    # LOCAL_RANK environment variable, defaulting to rank 0.
    if shared.args.local_rank is not None:
        local_rank = shared.args.local_rank
    else:
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    torch.cuda.set_device(local_rank)
    deepspeed.init_distributed()
    ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
    dschf = HfDeepSpeedConfig(ds_config)  # Keep this object alive for the Transformers integration
def _custom_from_pretrained_kwargs():
    """Build the keyword arguments for the "Custom" branch of load_model().

    The arguments are assembled as a real dict (instead of a source string
    that is later passed to ``eval``) so that paths and names containing
    quotes, braces or backslashes are forwarded verbatim.

    Side effect: when CPU mode was not requested but CUDA is unavailable,
    ``shared.args.cpu`` is switched to True and a warning is printed.

    Returns:
        dict: keyword arguments for ``AutoModelForCausalLM.from_pretrained``.
    """
    kwargs = {"low_cpu_mem_usage": True}

    if not shared.args.cpu and not torch.cuda.is_available():
        print("Warning: no GPU has been detected.\nFalling back to CPU mode.\n")
        shared.args.cpu = True

    if shared.args.cpu:
        kwargs["torch_dtype"] = torch.float32
        return kwargs

    kwargs["device_map"] = 'auto'
    if shared.args.load_in_8bit:
        kwargs["load_in_8bit"] = True
    elif shared.args.bf16:
        kwargs["torch_dtype"] = torch.bfloat16
    else:
        kwargs["torch_dtype"] = torch.float16

    # CPU budget used in every max_memory map; '99' is the fallback value.
    cpu_limit = f"{shared.args.cpu_memory or '99'}GiB"

    if shared.args.gpu_memory:
        # One entry per configured value, keyed by its position in the list.
        max_memory = {i: f"{amount}GiB" for i, amount in enumerate(shared.args.gpu_memory)}
        max_memory['cpu'] = cpu_limit
        kwargs["max_memory"] = max_memory
    elif not shared.args.load_in_8bit:
        # Derive a limit from device 0: total memory in MiB, rounded to whole
        # thousands with roughly 1000 MiB held back, and a further 1000 MiB
        # dropped when less than 800 MiB of headroom would remain.
        total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
        suggestion = round((total_mem - 1000) / 1000) * 1000
        if total_mem - suggestion < 800:
            suggestion -= 1000
        suggestion = int(round(suggestion / 1000))
        print(f"\033[1;32;1mAuto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors.\nYou can manually set other values.\033[0;37;0m")
        kwargs["max_memory"] = {0: f"{suggestion}GiB", 'cpu': cpu_limit}

    if shared.args.disk:
        kwargs["offload_folder"] = shared.args.disk_cache_dir

    return kwargs


def load_model(model_name):
    """Load a model and its tokenizer from the ``models/`` directory.

    The loading strategy is chosen from ``shared.args`` (first match wins):
    default settings, FlexGen, DeepSpeed ZeRO-3, RWKV, 4-bit LLaMA, or a
    custom ``from_pretrained`` call assembled from the memory/device options.

    Side effects: sets ``shared.is_RWKV``; may set ``shared.args.cpu`` to True
    when no GPU is detected; prints progress and timing messages.

    Note that most branches resolve the model directory from
    ``shared.model_name`` while the RWKV and 4-bit LLaMA branches use the
    ``model_name`` argument, so the two are expected to agree.

    Args:
        model_name: name of the model to load.

    Returns:
        tuple: ``(model, tokenizer)``. The RWKV branch returns early with its
        own tokenizer and skips the "Loaded the model" timing message.
    """
    print(f"Loading {model_name}...")
    t0 = time.time()

    shared.is_RWKV = model_name.lower().startswith('rwkv-')

    # Default settings
    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.llama_bits > 0, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
        if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
            model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
        else:
            model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16).cuda()

    # FlexGen
    elif shared.args.flexgen:
        # Initialize environment
        env = ExecutionEnv.create(shared.args.disk_cache_dir)

        # Offloading policy
        policy = Policy(1, 1,
                        shared.args.percent[0], shared.args.percent[1],
                        shared.args.percent[2], shared.args.percent[3],
                        shared.args.percent[4], shared.args.percent[5],
                        overlap=True, sep_layer=True, pin_weight=shared.args.pin_weight,
                        cpu_cache_compute=False, attn_sparsity=1.0,
                        compress_weight=shared.args.compress_weight,
                        comp_weight_config=CompressionConfig(
                            num_bits=4, group_size=64,
                            group_dim=0, symmetric=False),
                        compress_cache=False,
                        comp_cache_config=CompressionConfig(
                            num_bits=4, group_size=64,
                            group_dim=2, symmetric=False))

        model = OptLM(f"facebook/{shared.model_name}", env, "models", policy)

    # DeepSpeed ZeRO-3
    elif shared.args.deepspeed:
        # `ds_config` is the module-level config created during DeepSpeed setup.
        model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16)
        model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0]
        model.module.eval()  # Inference
        print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}")

    # RWKV model (not on HuggingFace)
    elif shared.is_RWKV:
        from modules.RWKV import RWKVModel, RWKVTokenizer

        model = RWKVModel.from_pretrained(Path(f'models/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
        tokenizer = RWKVTokenizer.from_pretrained(Path('models'))

        return model, tokenizer

    # 4-bit LLaMA
    elif shared.args.llama_bits > 0 or shared.args.load_in_4bit:
        from modules.quantized_LLaMA import load_quantized_LLaMA

        model = load_quantized_LLaMA(model_name)

    # Custom
    else:
        model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), **_custom_from_pretrained_kwargs())

    # Loading the tokenizer
    # gpt4chan-style models reuse the gpt-j-6B tokenizer when it is present.
    if shared.model_name.lower().startswith(('gpt4chan', 'gpt-4chan', '4chan')) and Path("models/gpt-j-6B/").exists():
        tokenizer = AutoTokenizer.from_pretrained(Path("models/gpt-j-6B/"))
    else:
        tokenizer = AutoTokenizer.from_pretrained(Path(f"models/{shared.model_name}/"))
    tokenizer.truncation_side = 'left'

    print(f"Loaded the model in {(time.time() - t0):.2f} seconds.")
    return model, tokenizer
def load_soft_prompt(name):
    """Activate the soft prompt called ``name``, or disable soft prompts.

    When ``name`` is the string ``'None'``, soft prompts are switched off
    (``shared.soft_prompt = False``, ``shared.soft_prompt_tensor = None``).
    Otherwise ``softprompts/<name>.zip`` is opened, its ``meta.json`` fields
    (except ``name``) are printed, and its ``tensor.npy`` array is moved to
    the device and dtype of ``shared.model``, given a leading dimension of
    size 1, and stored in ``shared.soft_prompt_tensor``.

    Both archive members are read directly into memory, so nothing is written
    to (or deleted from) the current working directory.

    Args:
        name: soft prompt name, or the literal string ``'None'``.

    Returns:
        The ``name`` argument, unchanged.

    Raises:
        FileNotFoundError: if the zip archive does not exist.
        KeyError: if the archive lacks ``tensor.npy`` or ``meta.json``.
    """
    import io  # only needed here; kept local so the file's import block is untouched

    if name == 'None':
        shared.soft_prompt = False
        shared.soft_prompt_tensor = None
        return name

    with zipfile.ZipFile(Path(f'softprompts/{name}.zip')) as zf:
        # Fetch both members before printing anything so that an incomplete
        # archive fails up front.
        tensor_bytes = zf.read('tensor.npy')
        meta_bytes = zf.read('meta.json')

    meta = json.loads(meta_bytes)
    print(f"\nLoading the softprompt \"{name}\".")
    for field, value in meta.items():
        if field == 'name':
            continue
        if isinstance(value, list):
            # str() each item so lists with non-string entries still print.
            print(f"{field}: {', '.join(map(str, value))}")
        else:
            print(f"{field}: {value}")
    print()

    tensor = np.load(io.BytesIO(tensor_bytes))
    tensor = torch.Tensor(tensor).to(device=shared.model.device, dtype=shared.model.dtype)
    # Prepend a dimension of size 1: (d0, d1) -> (1, d0, d1).
    tensor = torch.reshape(tensor, (1, tensor.shape[0], tensor.shape[1]))

    shared.soft_prompt = True
    shared.soft_prompt_tensor = tensor

    return name