2023-02-23 12:05:25 -03:00
import argparse
2023-04-14 11:07:28 -03:00
from pathlib import Path
import yaml
2023-02-23 12:05:25 -03:00
# Module-level state. Every value below starts out unset/empty.

# Model state
model = None
tokenizer = None
model_name = "None"
lora_names = []
soft_prompt_tensor = None
soft_prompt = False
is_RWKV = False
is_llamacpp = False

# Chat variables
history = {'internal': [], 'visible': []}
character = 'None'
stop_everything = False
processing_message = '*Is typing...*'

# UI elements (buttons, sliders, HTML, etc)
gradio = {}

# Generation input parameters
input_params = []

# For restarting the interface
need_restart = False
2023-02-23 13:42:23 -03:00
settings = {
' max_new_tokens ' : 200 ,
' max_new_tokens_min ' : 1 ,
' max_new_tokens_max ' : 2000 ,
2023-03-31 12:22:07 -03:00
' seed ' : - 1 ,
2023-03-23 13:36:00 -03:00
' name1 ' : ' You ' ,
' name2 ' : ' Assistant ' ,
' context ' : ' This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions. ' ,
2023-04-12 18:30:43 -03:00
' greeting ' : ' ' ,
2023-04-05 11:49:59 -03:00
' end_of_turn ' : ' ' ,
2023-04-11 08:30:06 -07:00
' custom_stopping_strings ' : ' ' ,
2023-03-18 10:55:57 -03:00
' stop_at_newline ' : False ,
2023-04-10 16:44:22 -03:00
' add_bos_token ' : True ,
2023-04-11 18:46:06 -03:00
' ban_eos_token ' : False ,
2023-04-16 14:24:49 -03:00
' skip_special_tokens ' : True ,
2023-04-11 18:46:06 -03:00
' truncation_length ' : 2048 ,
' truncation_length_min ' : 0 ,
' truncation_length_max ' : 4096 ,
2023-04-12 17:09:56 -03:00
' mode ' : ' cai-chat ' ,
2023-04-14 11:07:28 -03:00
' instruction_template ' : ' None ' ,
2023-02-23 13:42:23 -03:00
' chat_prompt_size ' : 2048 ,
' chat_prompt_size_min ' : 0 ,
' chat_prompt_size_max ' : 2048 ,
2023-02-25 01:42:19 -03:00
' chat_generation_attempts ' : 1 ,
' chat_generation_attempts_min ' : 1 ,
' chat_generation_attempts_max ' : 5 ,
2023-02-28 02:20:11 -03:00
' default_extensions ' : [ ] ,
' chat_default_extensions ' : [ " gallery " ] ,
2023-03-02 11:25:04 -03:00
' presets ' : {
2023-04-10 15:48:07 -03:00
' default ' : ' Default ' ,
2023-04-05 23:52:36 +02:00
' .*(alpaca|llama) ' : " LLaMA-Precise " ,
2023-03-30 17:34:44 -03:00
' .*pygmalion ' : ' NovelAI-Storywriter ' ,
2023-03-29 22:40:04 -03:00
' .*RWKV ' : ' Naive ' ,
2023-03-02 11:25:04 -03:00
} ,
' prompts ' : {
2023-03-29 22:40:04 -03:00
' default ' : ' QA ' ,
' .*(gpt4chan|gpt-4chan|4chan) ' : ' GPT-4chan ' ,
' .*oasst ' : ' Open Assistant ' ,
' .*alpaca ' : " Alpaca " ,
2023-03-17 11:24:52 -03:00
} ,
' lora_prompts ' : {
2023-03-29 22:40:04 -03:00
' default ' : ' QA ' ,
2023-04-14 10:52:06 -07:00
' .*alpaca ' : " Alpaca " ,
2023-03-02 11:25:04 -03:00
}
2023-02-23 13:42:23 -03:00
}
2023-02-23 12:05:25 -03:00
2023-04-07 00:15:45 -03:00
2023-03-04 01:04:02 -03:00
def str2bool(v):
    """Convert a command-line value to a bool.

    Intended for use as an argparse ``type=`` callable.

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string.
            Matching is case-insensitive.

    Returns:
        True for 'yes', 'true', 't', 'y', '1';
        False for 'no', 'false', 'f', 'n', '0'.

    Raises:
        argparse.ArgumentTypeError: If the string is none of the above.
    """
    if isinstance(v, bool):
        return v

    # Lowercase once instead of once per membership test.
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
2023-04-07 00:15:45 -03:00
# Command-line interface. max_help_position=54 widens the column in which
# option names are printed by --help.
parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54))

# Basic settings
parser.add_argument('--notebook', action='store_true', help='Launch the web UI in notebook mode, where the output is written to the same text box as the input.')
parser.add_argument('--chat', action='store_true', help='Launch the web UI in chat mode with a style similar to the Character.AI website.')
parser.add_argument('--cai-chat', action='store_true', help='DEPRECATED: use --chat instead.')
parser.add_argument('--model', type=str, help='Name of the model to load by default.')
parser.add_argument('--lora', type=str, help='Name of the LoRA to apply to the model by default.')
parser.add_argument("--model-dir", type=str, default='models/', help="Path to directory with all the models")
parser.add_argument("--lora-dir", type=str, default='loras/', help="Path to directory with all the loras")
parser.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
parser.add_argument('--no-stream', action='store_true', help="Don't stream the text output in real time.")
parser.add_argument('--settings', type=str, help='Load the default interface settings from this json file. See settings-template.json for an example. If you create a file called settings.json, this file will be loaded by default without the need to use the --settings flag.')
parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')

# Accelerate/transformers
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.')
parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM.")

# llama.cpp
parser.add_argument('--threads', type=int, default=0, help='Number of threads to use in llama.cpp.')

# GPTQ
parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
parser.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
parser.add_argument('--groupsize', type=int, default=-1, help='Group size.')
parser.add_argument('--pre_layer', type=int, default=0, help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.')
parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')
parser.add_argument('--no-quant_attn', action='store_true', help='(triton) Disable quant attention. If you encounter incoherent results try disabling this.')
parser.add_argument('--no-warmup_autotune', action='store_true', help='(triton) Disable warmup autotune.')
parser.add_argument('--no-fused_mlp', action='store_true', help='(triton) Disable fused mlp. If you encounter "Unexpected mma -> mma layout conversion" try disabling this.')

# FlexGen
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
# "%%" is required here: argparse %-formats help strings, so a bare "%" would break --help.
parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).")

# DeepSpeed
parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
parser.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')

# RWKV
parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')

# Gradio
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
parser.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.')
parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.')
parser.add_argument("--gradio-auth-path", type=str, help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"', default=None)

# Values parsed from the actual command line.
args = parser.parse_args()
# Parsing an empty argument list yields every option at its default value.
args_defaults = parser.parse_args([])
2023-03-14 07:56:31 -03:00
2023-04-05 11:49:59 -03:00
# Deprecation warnings for parameters that have been renamed.
# Each entry maps an old argument name to a pair: [0] is the replacement
# argument name, [1] is the value of the old argument that means "not set".
# The table is currently empty, so the loop below does nothing.
deprecated_dict = {}
for k in deprecated_dict:
    if getattr(args, k) != deprecated_dict[k][1]:
        print(f"Warning: --{k} is deprecated and will be removed. Use --{deprecated_dict[k][0]} instead.\n")
        # Forward the value given under the old name to the new argument.
        setattr(args, deprecated_dict[k][0], getattr(args, k))

# Deprecation warnings for parameters that have been removed
if args.cai_chat:
    print("Warning: --cai-chat is deprecated. Use --chat instead.\n")
    # --cai-chat now simply behaves like --chat.
    args.chat = True

# Security warnings
if args.trust_remote_code:
    print("Warning: trust_remote_code is enabled. This is dangerous.\n")
if args.share:
    print("Warning: the gradio \"share link\" feature downloads a proprietary and\nunaudited blob to create a reverse tunnel. This is potentially dangerous.\n")
2023-04-16 22:15:03 +00:00
2023-04-07 00:15:45 -03:00
2023-04-01 20:14:43 -03:00
def is_chat():
    """Return the value of the parsed ``--chat`` command-line flag."""
    return args.chat
2023-04-14 11:07:28 -03:00
# Loading model-specific settings (default)
# Path objects are no longer used as context managers here: that support was
# deprecated in Python 3.11 and removed in 3.13. read_text() also closes the
# file, which the previous bare open(...).read() did not.
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
    # safe_load returns None for an empty document; fall back to an empty dict.
    model_config = yaml.safe_load(p.read_text()) or {}
else:
    model_config = {}

# Applying user-defined model settings
p = Path(f'{args.model_dir}/config-user.yaml')
if p.exists():
    user_config = yaml.safe_load(p.read_text()) or {}
    for k in user_config:
        if k in model_config:
            # Same top-level key in both files: user values override the
            # defaults entry by entry.
            model_config[k].update(user_config[k])
        else:
            model_config[k] = user_config[k]