2023-02-23 17:28:30 +01:00
import json
import os
import time
import zipfile
from pathlib import Path
import numpy as np
import torch
2023-02-23 17:42:23 +01:00
import transformers
2023-03-16 17:34:23 +01:00
from accelerate import infer_auto_device_map , init_empty_weights
from transformers import ( AutoConfig , AutoModelForCausalLM , AutoTokenizer ,
BitsAndBytesConfig )
2023-02-23 18:41:42 +01:00
import modules . shared as shared
2023-02-23 17:28:30 +01:00
2023-02-23 17:42:23 +01:00
# Only let error-level messages from the transformers library through.
transformers.logging.set_verbosity_error()

# Remains None unless the DeepSpeed branch below assigns a rank.
local_rank = None

# FlexGen is imported only when the corresponding flag is set.
if shared.args.flexgen:
    from flexgen.flex_opt import CompressionConfig, ExecutionEnv, OptLM, Policy

# DeepSpeed: import lazily, then run the distributed setup at import time.
if shared.args.deepspeed:
    import deepspeed
    from transformers.deepspeed import (HfDeepSpeedConfig,
                                        is_deepspeed_zero3_enabled)

    from modules.deepspeed_parameters import generate_ds_config

    # Distributed setup: an explicit --local_rank wins over the environment.
    if shared.args.local_rank is not None:
        local_rank = shared.args.local_rank
    else:
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
    world_size = int(os.getenv("WORLD_SIZE", "1"))

    torch.cuda.set_device(local_rank)
    deepspeed.init_distributed()

    ds_config = generate_ds_config(
        shared.args.bf16,
        1 * world_size,
        shared.args.nvme_offload_dir,
    )
    # Keep this object alive for the Transformers integration
    dschf = HfDeepSpeedConfig(ds_config)
2023-03-13 18:00:38 +01:00
2023-02-23 17:28:30 +01:00
def load_model(model_name):
    """Load the model stored under ``models/<model_name>`` and its tokenizer.

    The loading strategy is picked from the command-line flags in
    ``shared.args``:

    * no relevant flag set -> default GPU loading (8-bit with an automatic
      device map when the name contains '13b', '20b' or '30b', otherwise
      half precision moved to CUDA);
    * ``--flexgen`` -> FlexGen ``OptLM`` with an offloading policy;
    * ``--deepspeed`` -> DeepSpeed initialisation using the module-level
      ``ds_config``;
    * names starting with ``rwkv-`` -> the project's RWKV wrapper;
    * ``--gptq_bits`` > 0 -> the project's GPTQ loader;
    * anything else -> ``AutoModelForCausalLM.from_pretrained`` with
      parameters assembled from the CPU / 8-bit / bf16 / memory / disk flags.

    Side effects: sets ``shared.is_RWKV``; may set ``shared.args.cpu = True``
    when no CUDA device is available.

    Args:
        model_name: Name of the sub-folder (or file, for RWKV) inside
            ``models/``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(f"Loading {model_name}...")
    t0 = time.time()

    shared.is_RWKV = model_name.lower().startswith('rwkv-')

    # The function argument is used consistently for every path below so that
    # the model that gets loaded is always the one the caller asked for.
    checkpoint = Path(f"models/{model_name}")
    half_dtype = torch.bfloat16 if shared.args.bf16 else torch.float16

    # Default settings
    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
        if any(size in model_name.lower() for size in ('13b', '20b', '30b')):
            model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='auto', load_in_8bit=True)
        else:
            model = AutoModelForCausalLM.from_pretrained(checkpoint, low_cpu_mem_usage=True, torch_dtype=half_dtype).cuda()

    # FlexGen
    elif shared.args.flexgen:
        # Initialize environment
        env = ExecutionEnv.create(shared.args.disk_cache_dir)

        # Offloading policy
        policy = Policy(1, 1,
                        shared.args.percent[0], shared.args.percent[1],
                        shared.args.percent[2], shared.args.percent[3],
                        shared.args.percent[4], shared.args.percent[5],
                        overlap=True, sep_layer=True, pin_weight=shared.args.pin_weight,
                        cpu_cache_compute=False, attn_sparsity=1.0,
                        compress_weight=shared.args.compress_weight,
                        comp_weight_config=CompressionConfig(
                            num_bits=4, group_size=64,
                            group_dim=0, symmetric=False),
                        compress_cache=False,
                        comp_cache_config=CompressionConfig(
                            num_bits=4, group_size=64,
                            group_dim=2, symmetric=False))

        model = OptLM(f"facebook/{model_name}", env, "models", policy)

    # DeepSpeed ZeRO-3
    elif shared.args.deepspeed:
        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=half_dtype)
        model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0]
        model.module.eval()  # Inference
        print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}")

    # RWKV model (not on HuggingFace)
    elif shared.is_RWKV:
        from modules.RWKV import RWKVModel, RWKVTokenizer

        model = RWKVModel.from_pretrained(checkpoint, dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
        tokenizer = RWKVTokenizer.from_pretrained(Path('models'))

        # RWKV ships its own tokenizer, so the generic tokenizer step is skipped.
        return model, tokenizer

    # Quantized model
    elif shared.args.gptq_bits > 0:
        from modules.GPTQ_loader import load_quantized

        model = load_quantized(model_name)

    # Custom
    else:
        params = {"low_cpu_mem_usage": True}
        if not shared.args.cpu and not torch.cuda.is_available():
            print("Warning: torch.cuda.is_available() returned False.\nThis means that no GPU has been detected.\nFalling back to CPU mode.\n")
            shared.args.cpu = True

        if shared.args.cpu:
            params["torch_dtype"] = torch.float32
        else:
            params["device_map"] = 'auto'
            if shared.args.load_in_8bit:
                params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
            elif shared.args.bf16:
                params["torch_dtype"] = torch.bfloat16
            else:
                params["torch_dtype"] = torch.float16

            cpu_limit = f'{shared.args.cpu_memory or 99}GiB'
            if shared.args.gpu_memory:
                # One entry per GPU index, in the order given on the command line.
                max_memory = {i: f'{mem}GiB' for i, mem in enumerate(shared.args.gpu_memory)}
                max_memory['cpu'] = cpu_limit
            else:
                # Derive a limit for GPU 0 from its total memory (in MiB),
                # leaving roughly 1000 MiB or more of headroom.
                total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
                suggestion = round((total_mem - 1000) / 1000) * 1000
                if total_mem - suggestion < 800:
                    suggestion -= 1000
                suggestion = int(round(suggestion / 1000))
                print(f"\033[1;32;1mAuto-assigning --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors.\nYou can manually set other values.\033[0;37;0m")

                max_memory = {0: f'{suggestion}GiB', 'cpu': cpu_limit}
            params['max_memory'] = max_memory

            if shared.args.disk:
                params["offload_folder"] = shared.args.disk_cache_dir

        # With 8-bit loading and explicit memory limits, compute the device map
        # on an empty (weightless) copy of the model first.
        if shared.args.load_in_8bit and params.get('max_memory', None) is not None and params.get('device_map') == 'auto':
            config = AutoConfig.from_pretrained(checkpoint)
            with init_empty_weights():
                model = AutoModelForCausalLM.from_config(config)
            model.tie_weights()
            params['device_map'] = infer_auto_device_map(
                model,
                dtype=torch.int8,
                max_memory=params['max_memory'],
                no_split_module_classes=model._no_split_modules
            )

        model = AutoModelForCausalLM.from_pretrained(checkpoint, **params)

    # Loading the tokenizer
    if model_name.lower().startswith(('gpt4chan', 'gpt-4chan', '4chan')) and Path("models/gpt-j-6B/").exists():
        # These models reuse the GPT-J tokenizer when it is available locally.
        tokenizer = AutoTokenizer.from_pretrained(Path("models/gpt-j-6B/"))
    else:
        tokenizer = AutoTokenizer.from_pretrained(Path(f"models/{model_name}/"))
    tokenizer.truncation_side = 'left'

    print(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
    return model, tokenizer
def load_soft_prompt(name):
    """Activate the soft prompt ``softprompts/<name>.zip`` or disable soft prompts.

    When ``name`` is the string ``'None'``, soft prompts are switched off
    (``shared.soft_prompt = False`` and ``shared.soft_prompt_tensor = None``).
    Otherwise the archive's ``meta.json`` is printed field by field (except
    ``name``), ``tensor.npy`` is loaded, moved to the device and dtype of
    ``shared.model``, reshaped to ``(1, shape[0], shape[1])`` and stored in
    ``shared.soft_prompt_tensor``.

    Both archive members are read directly from the zip file in memory, so
    nothing is written to (or deleted from) the current working directory.

    Args:
        name: Soft prompt name without the ``.zip`` extension, or ``'None'``.

    Returns:
        ``name``, unchanged.
    """
    import io

    if name == 'None':
        shared.soft_prompt = False
        shared.soft_prompt_tensor = None
        return name

    with zipfile.ZipFile(Path(f'softprompts/{name}.zip')) as zf:
        meta = json.loads(zf.read('meta.json'))
        # np.load needs a seekable file object; wrap the raw bytes.
        tensor = np.load(io.BytesIO(zf.read('tensor.npy')))

    print(f"\nLoading the softprompt \"{name}\".")
    for field, value in meta.items():
        if field == 'name':
            continue
        if isinstance(value, list):
            # str() keeps the join from failing on non-string list items.
            print(f"{field}: {', '.join(str(item) for item in value)}")
        else:
            print(f"{field}: {value}")
    print()

    tensor = torch.Tensor(tensor).to(device=shared.model.device, dtype=shared.model.dtype)
    # Add a leading dimension of size 1 in front of the two tensor dimensions.
    tensor = torch.reshape(tensor, (1, tensor.shape[0], tensor.shape[1]))

    shared.soft_prompt = True
    shared.soft_prompt_tensor = tensor

    return name