2023-08-07 02:49:27 +02:00
import importlib
import math
import re
import traceback
from functools import partial
import gradio as gr
import psutil
import torch
from modules import loaders , shared , ui , utils
from modules . logging_colors import logger
from modules . LoRA import add_lora_to_model
from modules . models import load_model , unload_model
from modules . models_settings import (
apply_model_settings_to_state ,
save_model_settings ,
update_model_parameters
)
from modules . utils import gradio
def create_ui ( ) :
# Finding the default values for the GPU and CPU memories
total_mem = [ ]
for i in range ( torch . cuda . device_count ( ) ) :
total_mem . append ( math . floor ( torch . cuda . get_device_properties ( i ) . total_memory / ( 1024 * 1024 ) ) )
default_gpu_mem = [ ]
if shared . args . gpu_memory is not None and len ( shared . args . gpu_memory ) > 0 :
for i in shared . args . gpu_memory :
if ' mib ' in i . lower ( ) :
default_gpu_mem . append ( int ( re . sub ( ' [a-zA-Z ] ' , ' ' , i ) ) )
else :
default_gpu_mem . append ( int ( re . sub ( ' [a-zA-Z ] ' , ' ' , i ) ) * 1000 )
while len ( default_gpu_mem ) < len ( total_mem ) :
default_gpu_mem . append ( 0 )
total_cpu_mem = math . floor ( psutil . virtual_memory ( ) . total / ( 1024 * 1024 ) )
if shared . args . cpu_memory is not None :
default_cpu_mem = re . sub ( ' [a-zA-Z ] ' , ' ' , shared . args . cpu_memory )
else :
default_cpu_mem = 0
with gr . Tab ( " Model " , elem_id = " model-tab " ) :
with gr . Row ( ) :
with gr . Column ( ) :
with gr . Row ( ) :
with gr . Column ( ) :
with gr . Row ( ) :
shared . gradio [ ' model_menu ' ] = gr . Dropdown ( choices = utils . get_available_models ( ) , value = shared . model_name , label = ' Model ' , elem_classes = ' slim-dropdown ' )
ui . create_refresh_button ( shared . gradio [ ' model_menu ' ] , lambda : None , lambda : { ' choices ' : utils . get_available_models ( ) } , ' refresh-button ' )
shared . gradio [ ' load_model ' ] = gr . Button ( " Load " , visible = not shared . settings [ ' autoload_model ' ] , elem_classes = ' refresh-button ' )
shared . gradio [ ' unload_model ' ] = gr . Button ( " Unload " , elem_classes = ' refresh-button ' )
shared . gradio [ ' reload_model ' ] = gr . Button ( " Reload " , elem_classes = ' refresh-button ' )
shared . gradio [ ' save_model_settings ' ] = gr . Button ( " Save settings " , elem_classes = ' refresh-button ' )
with gr . Column ( ) :
with gr . Row ( ) :
shared . gradio [ ' lora_menu ' ] = gr . Dropdown ( multiselect = True , choices = utils . get_available_loras ( ) , value = shared . lora_names , label = ' LoRA(s) ' , elem_classes = ' slim-dropdown ' )
ui . create_refresh_button ( shared . gradio [ ' lora_menu ' ] , lambda : None , lambda : { ' choices ' : utils . get_available_loras ( ) , ' value ' : shared . lora_names } , ' refresh-button ' )
shared . gradio [ ' lora_menu_apply ' ] = gr . Button ( value = ' Apply LoRAs ' , elem_classes = ' refresh-button ' )
with gr . Row ( ) :
with gr . Column ( ) :
shared . gradio [ ' loader ' ] = gr . Dropdown ( label = " Model loader " , choices = [ " Transformers " , " ExLlama_HF " , " ExLlama " , " AutoGPTQ " , " GPTQ-for-LLaMa " , " llama.cpp " , " llamacpp_HF " ] , value = None )
with gr . Box ( ) :
with gr . Row ( ) :
with gr . Column ( ) :
for i in range ( len ( total_mem ) ) :
shared . gradio [ f ' gpu_memory_ { i } ' ] = gr . Slider ( label = f " gpu-memory in MiB for device : { i } " , maximum = total_mem [ i ] , value = default_gpu_mem [ i ] )
shared . gradio [ ' cpu_memory ' ] = gr . Slider ( label = " cpu-memory in MiB " , maximum = total_cpu_mem , value = default_cpu_mem )
shared . gradio [ ' transformers_info ' ] = gr . Markdown ( ' load-in-4bit params: ' )
shared . gradio [ ' compute_dtype ' ] = gr . Dropdown ( label = " compute_dtype " , choices = [ " bfloat16 " , " float16 " , " float32 " ] , value = shared . args . compute_dtype )
shared . gradio [ ' quant_type ' ] = gr . Dropdown ( label = " quant_type " , choices = [ " nf4 " , " fp4 " ] , value = shared . args . quant_type )
shared . gradio [ ' n_gpu_layers ' ] = gr . Slider ( label = " n-gpu-layers " , minimum = 0 , maximum = 128 , value = shared . args . n_gpu_layers )
shared . gradio [ ' n_ctx ' ] = gr . Slider ( minimum = 0 , maximum = 16384 , step = 256 , label = " n_ctx " , value = shared . args . n_ctx )
shared . gradio [ ' threads ' ] = gr . Slider ( label = " threads " , minimum = 0 , step = 1 , maximum = 32 , value = shared . args . threads )
shared . gradio [ ' n_batch ' ] = gr . Slider ( label = " n_batch " , minimum = 1 , maximum = 2048 , value = shared . args . n_batch )
shared . gradio [ ' n_gqa ' ] = gr . Slider ( minimum = 0 , maximum = 16 , step = 1 , label = " n_gqa " , value = shared . args . n_gqa , info = ' grouped-query attention. Must be 8 for llama-2 70b. ' )
2023-08-08 05:31:11 +02:00
shared . gradio [ ' rms_norm_eps ' ] = gr . Slider ( minimum = 0 , maximum = 1e-5 , step = 1e-6 , label = " rms_norm_eps " , value = shared . args . rms_norm_eps , info = ' 5e-6 is a good value for llama-2 models. ' )
2023-08-07 02:49:27 +02:00
shared . gradio [ ' wbits ' ] = gr . Dropdown ( label = " wbits " , choices = [ " None " , 1 , 2 , 3 , 4 , 8 ] , value = str ( shared . args . wbits ) if shared . args . wbits > 0 else " None " )
shared . gradio [ ' groupsize ' ] = gr . Dropdown ( label = " groupsize " , choices = [ " None " , 32 , 64 , 128 , 1024 ] , value = str ( shared . args . groupsize ) if shared . args . groupsize > 0 else " None " )
shared . gradio [ ' model_type ' ] = gr . Dropdown ( label = " model_type " , choices = [ " None " , " llama " , " opt " , " gptj " ] , value = shared . args . model_type or " None " )
shared . gradio [ ' pre_layer ' ] = gr . Slider ( label = " pre_layer " , minimum = 0 , maximum = 100 , value = shared . args . pre_layer [ 0 ] if shared . args . pre_layer is not None else 0 )
shared . gradio [ ' autogptq_info ' ] = gr . Markdown ( ' * ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA. ' )
shared . gradio [ ' gpu_split ' ] = gr . Textbox ( label = ' gpu-split ' , info = ' Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7 ' )
shared . gradio [ ' max_seq_len ' ] = gr . Slider ( label = ' max_seq_len ' , minimum = 0 , maximum = 16384 , step = 256 , info = ' Maximum sequence length. ' , value = shared . args . max_seq_len )
2023-08-09 06:24:28 +02:00
shared . gradio [ ' alpha_value ' ] = gr . Slider ( label = ' alpha_value ' , minimum = 1 , maximum = 8 , step = 0.1 , info = ' Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. ' , value = shared . args . alpha_value )
2023-08-07 02:49:27 +02:00
shared . gradio [ ' compress_pos_emb ' ] = gr . Slider ( label = ' compress_pos_emb ' , minimum = 1 , maximum = 8 , step = 1 , info = ' Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. ' , value = shared . args . compress_pos_emb )
with gr . Column ( ) :
shared . gradio [ ' triton ' ] = gr . Checkbox ( label = " triton " , value = shared . args . triton )
shared . gradio [ ' no_inject_fused_attention ' ] = gr . Checkbox ( label = " no_inject_fused_attention " , value = shared . args . no_inject_fused_attention , info = ' Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM. ' )
shared . gradio [ ' no_inject_fused_mlp ' ] = gr . Checkbox ( label = " no_inject_fused_mlp " , value = shared . args . no_inject_fused_mlp , info = ' Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM. ' )
shared . gradio [ ' no_use_cuda_fp16 ' ] = gr . Checkbox ( label = " no_use_cuda_fp16 " , value = shared . args . no_use_cuda_fp16 , info = ' This can make models faster on some systems. ' )
shared . gradio [ ' desc_act ' ] = gr . Checkbox ( label = " desc_act " , value = shared . args . desc_act , info = ' \' desc_act \' , \' wbits \' , and \' groupsize \' are used for old models without a quantize_config.json. ' )
shared . gradio [ ' cpu ' ] = gr . Checkbox ( label = " cpu " , value = shared . args . cpu )
shared . gradio [ ' load_in_8bit ' ] = gr . Checkbox ( label = " load-in-8bit " , value = shared . args . load_in_8bit )
shared . gradio [ ' bf16 ' ] = gr . Checkbox ( label = " bf16 " , value = shared . args . bf16 )
shared . gradio [ ' auto_devices ' ] = gr . Checkbox ( label = " auto-devices " , value = shared . args . auto_devices )
shared . gradio [ ' disk ' ] = gr . Checkbox ( label = " disk " , value = shared . args . disk )
shared . gradio [ ' load_in_4bit ' ] = gr . Checkbox ( label = " load-in-4bit " , value = shared . args . load_in_4bit )
shared . gradio [ ' use_double_quant ' ] = gr . Checkbox ( label = " use_double_quant " , value = shared . args . use_double_quant )
shared . gradio [ ' no_mmap ' ] = gr . Checkbox ( label = " no-mmap " , value = shared . args . no_mmap )
shared . gradio [ ' low_vram ' ] = gr . Checkbox ( label = " low-vram " , value = shared . args . low_vram )
shared . gradio [ ' mlock ' ] = gr . Checkbox ( label = " mlock " , value = shared . args . mlock )
shared . gradio [ ' llama_cpp_seed ' ] = gr . Number ( label = ' Seed (0 for random) ' , value = shared . args . llama_cpp_seed )
shared . gradio [ ' trust_remote_code ' ] = gr . Checkbox ( label = " trust-remote-code " , value = shared . args . trust_remote_code , info = ' Make sure to inspect the .py files inside the model folder before loading it with this option enabled. ' )
shared . gradio [ ' gptq_for_llama_info ' ] = gr . Markdown ( ' GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1). ' )
shared . gradio [ ' exllama_info ' ] = gr . Markdown ( ' For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md). ' )
shared . gradio [ ' exllama_HF_info ' ] = gr . Markdown ( ' ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It \' s a bit slower than the regular ExLlama. ' )
shared . gradio [ ' llamacpp_HF_info ' ] = gr . Markdown ( ' llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under " Download custom model or LoRA " . ' )
with gr . Column ( ) :
with gr . Row ( ) :
shared . gradio [ ' autoload_model ' ] = gr . Checkbox ( value = shared . settings [ ' autoload_model ' ] , label = ' Autoload the model ' , info = ' Whether to load the model as soon as it is selected in the Model dropdown. ' )
shared . gradio [ ' custom_model_menu ' ] = gr . Textbox ( label = " Download custom model or LoRA " , info = " Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \" : \" character like this: facebook/galactica-125m:main " )
shared . gradio [ ' download_model_button ' ] = gr . Button ( " Download " )
with gr . Row ( ) :
shared . gradio [ ' model_status ' ] = gr . Markdown ( ' No model is loaded ' if shared . model_name == ' None ' else ' Ready ' )
def create_event_handlers ( ) :
shared . gradio [ ' loader ' ] . change ( loaders . make_loader_params_visible , gradio ( ' loader ' ) , gradio ( loaders . get_all_params ( ) ) )
# In this event handler, the interface state is read and updated
# with the model defaults (if any), and then the model is loaded
# unless "autoload_model" is unchecked
shared . gradio [ ' model_menu ' ] . change (
ui . gather_interface_values , gradio ( shared . input_elements ) , gradio ( ' interface_state ' ) ) . then (
apply_model_settings_to_state , gradio ( ' model_menu ' , ' interface_state ' ) , gradio ( ' interface_state ' ) ) . then (
ui . apply_interface_values , gradio ( ' interface_state ' ) , gradio ( ui . list_interface_input_elements ( ) ) , show_progress = False ) . then (
update_model_parameters , gradio ( ' interface_state ' ) , None ) . then (
load_model_wrapper , gradio ( ' model_menu ' , ' loader ' , ' autoload_model ' ) , gradio ( ' model_status ' ) , show_progress = False )
shared . gradio [ ' load_model ' ] . click (
ui . gather_interface_values , gradio ( shared . input_elements ) , gradio ( ' interface_state ' ) ) . then (
update_model_parameters , gradio ( ' interface_state ' ) , None ) . then (
partial ( load_model_wrapper , autoload = True ) , gradio ( ' model_menu ' , ' loader ' ) , gradio ( ' model_status ' ) , show_progress = False )
shared . gradio [ ' unload_model ' ] . click (
unload_model , None , None ) . then (
lambda : " Model unloaded " , None , gradio ( ' model_status ' ) )
shared . gradio [ ' reload_model ' ] . click (
unload_model , None , None ) . then (
ui . gather_interface_values , gradio ( shared . input_elements ) , gradio ( ' interface_state ' ) ) . then (
update_model_parameters , gradio ( ' interface_state ' ) , None ) . then (
partial ( load_model_wrapper , autoload = True ) , gradio ( ' model_menu ' , ' loader ' ) , gradio ( ' model_status ' ) , show_progress = False )
shared . gradio [ ' save_model_settings ' ] . click (
ui . gather_interface_values , gradio ( shared . input_elements ) , gradio ( ' interface_state ' ) ) . then (
save_model_settings , gradio ( ' model_menu ' , ' interface_state ' ) , gradio ( ' model_status ' ) , show_progress = False )
shared . gradio [ ' lora_menu_apply ' ] . click ( load_lora_wrapper , gradio ( ' lora_menu ' ) , gradio ( ' model_status ' ) , show_progress = False )
shared . gradio [ ' download_model_button ' ] . click ( download_model_wrapper , gradio ( ' custom_model_menu ' ) , gradio ( ' model_status ' ) , show_progress = True )
shared . gradio [ ' autoload_model ' ] . change ( lambda x : gr . update ( visible = not x ) , gradio ( ' autoload_model ' ) , gradio ( ' load_model ' ) )
def load_model_wrapper ( selected_model , loader , autoload = False ) :
if not autoload :
yield f " The settings for { selected_model } have been updated. \n Click on \" Load \" to load it. "
return
if selected_model == ' None ' :
yield " No model selected "
else :
try :
yield f " Loading { selected_model } ... "
shared . model_name = selected_model
unload_model ( )
if selected_model != ' ' :
shared . model , shared . tokenizer = load_model ( shared . model_name , loader )
if shared . model is not None :
yield f " Successfully loaded { selected_model } "
else :
yield f " Failed to load { selected_model } . "
except :
exc = traceback . format_exc ( )
logger . error ( ' Failed to load the model. ' )
print ( exc )
yield exc . replace ( ' \n ' , ' \n \n ' )
def load_lora_wrapper ( selected_loras ) :
yield ( " Applying the following LoRAs to {} : \n \n {} " . format ( shared . model_name , ' \n ' . join ( selected_loras ) ) )
add_lora_to_model ( selected_loras )
yield ( " Successfuly applied the LoRAs " )
def download_model_wrapper ( repo_id , progress = gr . Progress ( ) ) :
try :
downloader_module = importlib . import_module ( " download-model " )
downloader = downloader_module . ModelDownloader ( )
repo_id_parts = repo_id . split ( " : " )
model = repo_id_parts [ 0 ] if len ( repo_id_parts ) > 0 else repo_id
branch = repo_id_parts [ 1 ] if len ( repo_id_parts ) > 1 else " main "
check = False
progress ( 0.0 )
yield ( " Cleaning up the model/branch names " )
model , branch = downloader . sanitize_model_and_branch_names ( model , branch )
yield ( " Getting the download links from Hugging Face " )
links , sha256 , is_lora = downloader . get_download_links_from_huggingface ( model , branch , text_only = False )
yield ( " Getting the output folder " )
base_folder = shared . args . lora_dir if is_lora else shared . args . model_dir
output_folder = downloader . get_output_folder ( model , branch , is_lora , base_folder = base_folder )
if check :
progress ( 0.5 )
yield ( " Checking previously downloaded files " )
downloader . check_model_files ( model , branch , links , sha256 , output_folder )
progress ( 1.0 )
else :
yield ( f " Downloading files to { output_folder } " )
downloader . download_model_files ( model , branch , links , sha256 , output_folder , progress_bar = progress , threads = 1 )
yield ( " Done! " )
except :
progress ( 1.0 )
yield traceback . format_exc ( ) . replace ( ' \n ' , ' \n \n ' )