2023-02-23 16:05:25 +01:00
import argparse
2023-09-19 22:11:46 +02:00
import sys
2023-05-12 11:09:45 +02:00
from collections import OrderedDict
2023-04-14 16:07:28 +02:00
from pathlib import Path
import yaml
2023-02-23 16:05:25 +01:00
2023-05-22 03:42:34 +02:00
from modules . logging_colors import logger
2023-08-07 03:58:59 +02:00
# Model variables: module-level handles for the currently loaded model.
model = None        # loaded model object; None until a model is loaded
tokenizer = None    # tokenizer belonging to the loaded model
model_name = 'None'  # name of the loaded model, 'None' when nothing is loaded
is_seq2seq = False  # whether the loaded model is an encoder-decoder model
model_dirty_from_training = False  # set when training has modified the model in place
lora_names = []     # names of the LoRAs currently applied to the model
2023-02-23 17:42:23 +01:00
2023-08-07 03:58:59 +02:00
# Generation variables
stop_everything = False  # flag used to request that any ongoing generation stop
generation_lock = None   # lock serializing generation; initialized elsewhere
processing_message = '*Is typing...*'  # placeholder shown in the UI while generating
2023-02-23 19:11:18 +01:00
2023-08-07 03:58:59 +02:00
# UI variables
gradio = {}                      # registry of gradio components, keyed by name
persistent_interface_state = {}  # UI state preserved across interactions
need_restart = False             # set when the UI must be relaunched to apply changes
2023-08-03 06:13:16 +02:00
2023-08-07 03:58:59 +02:00
# UI defaults: the baseline interface settings. A user-provided settings.yaml
# (see the --settings flag) can override any of these keys.
settings = {
    # Appearance
    'dark_theme': True,
    'show_controls': True,
    # Session
    'start_with': '',
    'mode': 'chat',
    'chat_style': 'cai-chat',
    'prompt-default': 'QA',
    'prompt-notebook': 'QA',
    'preset': 'simple-1',
    # Generation limits
    'max_new_tokens': 200,
    'max_new_tokens_min': 1,
    'max_new_tokens_max': 4096,
    'seed': -1,
    'negative_prompt': '',
    'truncation_length': 2048,
    'truncation_length_min': 0,
    'truncation_length_max': 32768,
    # Token handling
    'custom_stopping_strings': '',
    'auto_max_new_tokens': False,
    'max_tokens_second': 0,
    'ban_eos_token': False,
    'custom_token_bans': '',
    'add_bos_token': True,
    'skip_special_tokens': True,
    'stream': True,
    # Chat defaults
    'name1': 'You',
    'character': 'Assistant',
    'instruction_template': 'Alpaca',
    'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
    # Misc
    'autoload_model': False,
    'default_extensions': ['gallery'],
}
2023-02-23 16:05:25 +01:00
2023-04-07 05:15:45 +02:00
2023-03-04 05:04:02 +01:00
def str2bool(v):
    """Parse a human-friendly boolean for argparse.

    Accepts actual bools unchanged; otherwise maps common yes/no spellings
    (case-insensitively) to True/False. Raises ArgumentTypeError on anything
    else so argparse reports a clean usage error.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False

    raise argparse.ArgumentTypeError('Boolean value expected.')
2023-04-07 05:15:45 +02:00
# Command-line parser; widened help column so long flag names stay aligned.
parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54))

# Basic settings
parser.add_argument('--notebook', action='store_true', help='DEPRECATED')
parser.add_argument('--chat', action='store_true', help='DEPRECATED')
parser.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. WARNING: this is highly experimental.')
parser.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.')
parser.add_argument('--model', type=str, help='Name of the model to load by default.')
parser.add_argument('--lora', type=str, nargs="+", help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
parser.add_argument("--model-dir", type=str, default='models/', help="Path to directory with all the models")
parser.add_argument("--lora-dir", type=str, default='loras/', help="Path to directory with all the loras")
parser.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
parser.add_argument('--no-stream', action='store_true', help='DEPRECATED')
parser.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
parser.add_argument('--chat-buttons', action='store_true', help='Show buttons on chat tab instead of hover menu.')
2023-04-01 02:18:05 +02:00
2023-06-17 00:00:37 +02:00
# Model loader
parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv')

# Accelerate/transformers
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.')
parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon.")
parser.add_argument('--use_fast', action='store_true', help="Set use_fast=True while loading a tokenizer.")

# Accelerate 4-bit
parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
parser.add_argument('--compute_dtype', type=str, default="float16", help="compute dtype for 4-bit. Valid options: bfloat16, float16, float32.")
parser.add_argument('--quant_type', type=str, default="nf4", help='quant_type for 4-bit. Valid options: nf4, fp4.')
parser.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.')

# llama.cpp
parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
parser.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
parser.add_argument('--mul_mat_q', action='store_true', help='Activate new mulmat kernels.')
parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
parser.add_argument('--tensor_split', type=str, default=None, help="Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17")
parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)')
parser.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp')
2023-04-01 02:18:05 +02:00
# GPTQ
parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
parser.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
parser.add_argument('--groupsize', type=int, default=-1, help='Group size.')
parser.add_argument('--pre_layer', type=int, nargs="+", help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.')
parser.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')

# AutoGPTQ
parser.add_argument('--triton', action='store_true', help='Use triton.')
parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).')
parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')
parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')

# ExLlama
parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")
parser.add_argument('--cfg-cache', action='store_true', help="ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.")
2023-06-17 01:49:36 +02:00
2023-04-01 02:18:05 +02:00
# DeepSpeed
parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
parser.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')

# RWKV
parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')

# RoPE
parser.add_argument('--alpha_value', type=float, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.")
parser.add_argument('--rope_freq_base', type=int, default=0, help="If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).")
parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.")

# Gradio
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
parser.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.')
parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.')
parser.add_argument("--gradio-auth", type=str, help='set gradio authentication like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3"', default=None)
parser.add_argument("--gradio-auth-path", type=str, help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"', default=None)
parser.add_argument("--ssl-keyfile", type=str, help='The path to the SSL certificate key file.', default=None)
parser.add_argument("--ssl-certfile", type=str, help='The path to the SSL certificate cert file.', default=None)

# API
parser.add_argument('--api', action='store_true', help='Enable the API extension.')
parser.add_argument('--api-blocking-port', type=int, default=5000, help='The listening port for the blocking API.')
parser.add_argument('--api-streaming-port', type=int, default=5005, help='The listening port for the streaming API.')
parser.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None)

# Multimodal
parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
2023-04-23 20:52:43 +02:00
2023-02-23 16:05:25 +01:00
args = parser.parse_args()
args_defaults = parser.parse_args([])

# Record which option dests were explicitly given on the command line, so code
# elsewhere can distinguish user-provided values from argparse defaults.
provided_arguments = [
    dest
    for dest in (raw.lstrip('-').replace('-', '_') for raw in sys.argv[1:])
    if hasattr(args, dest)
]
2023-03-14 11:56:31 +01:00
2023-08-13 06:12:15 +02:00
# Deprecation warnings
for flag in ['chat', 'notebook', 'no_stream']:
    if getattr(args, flag):
        logger.warning(f'The --{flag} flag has been deprecated and will be removed soon. Please remove that flag.')

# Security warnings
if args.trust_remote_code:
    logger.warning("trust_remote_code is enabled. This is dangerous.")

if args.share:
    logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")

# Exposed to the network (or a share link) but no authentication configured.
if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)):
    logger.warning("\nYou are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")

if args.multi_user:
    logger.warning("\nThe multi-user mode is highly experimental and should not be shared publicly.")
2023-04-17 00:15:03 +02:00
2023-05-10 01:18:02 +02:00
2023-06-17 00:00:37 +02:00
def fix_loader_name(name):
    """Map a loosely-spelled loader name to its canonical form.

    Falsy input (None, '') is returned unchanged; an unrecognized name falls
    through and yields None, matching the behavior of the original if/elif
    chain.
    """
    if not name:
        return name

    # Canonical loader name -> accepted (lowercased) spellings.
    aliases = {
        'llama.cpp': ['llamacpp', 'llama.cpp', 'llama-cpp', 'llama cpp'],
        'llamacpp_HF': ['llamacpp_hf', 'llama.cpp_hf', 'llama-cpp-hf', 'llamacpp-hf', 'llama.cpp-hf'],
        'Transformers': ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face'],
        'AutoGPTQ': ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq'],
        'GPTQ-for-LLaMa': ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama'],
        'ExLlama': ['exllama', 'ex-llama', 'ex_llama', 'exlama'],
        'ExLlama_HF': ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf'],
        'ExLlamav2': ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2'],
        'ExLlamav2_HF': ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf'],
        'ctransformers': ['ctransformers', 'ctranforemrs', 'ctransformer'],
        'AutoAWQ': ['autoawq', 'awq', 'auto-awq'],
    }

    lowered = name.lower()
    for canonical, spellings in aliases.items():
        if lowered in spellings:
            return canonical
    # Unknown loader: implicit None, as before.
2023-06-17 00:00:37 +02:00
2023-05-10 01:18:02 +02:00
def add_extension(name):
    """Add `name` to args.extensions, initializing the list if needed.

    Bug fix: the duplicate check previously tested the literal 'api'
    (`'api' not in args.extensions`) instead of the extension being added.
    That meant once 'api' was present no other extension could ever be
    appended here (e.g. 'multimodal' was silently dropped), while non-'api'
    extensions could be appended repeatedly. The check now guards against
    duplicates of `name` itself.
    """
    if args.extensions is None:
        args.extensions = [name]
    elif name not in args.extensions:
        args.extensions.append(name)
2023-04-02 01:14:43 +02:00
def is_chat():
    """Always True; kept only so legacy callers keep working."""
    return True
2023-07-04 05:03:30 +02:00
2023-08-07 03:58:59 +02:00
# Normalize the loader name given on the command line to its canonical form.
args.loader = fix_loader_name(args.loader)

# Auto-activate built-in extensions implied by other flags.
if args.api or args.public_api:
    add_extension('api')

if args.multimodal_pipeline is not None:
    add_extension('multimodal')
# Load model-specific settings.
# FIX: the original wrapped these paths in `with Path(...) as p:`, relying on
# pathlib's context-manager support, which is deprecated since Python 3.11 and
# removed in 3.13; it also leaked the file handle via a bare open(...).read().
# Additionally, an existing-but-empty YAML file makes yaml.safe_load() return
# None, which would crash OrderedDict(None) below — `or {}` guards that.
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
    with open(p, 'r') as f:
        model_config = yaml.safe_load(f.read()) or {}
else:
    model_config = {}

# Load custom model-specific settings (user overrides).
p = Path(f'{args.model_dir}/config-user.yaml')
if p.exists():
    with open(p, 'r') as f:
        user_config = yaml.safe_load(f.read()) or {}
else:
    user_config = {}

# NOTE(review): OrderedDict kept for compatibility with downstream consumers;
# plain dicts already preserve insertion order on Python 3.7+ — confirm nothing
# relies on OrderedDict-specific methods before simplifying.
model_config = OrderedDict(model_config)
user_config = OrderedDict(user_config)