text-generation-webui/modules/ui_model_menu.py

import importlib
import math
import re
import traceback
from functools import partial

import gradio as gr
import psutil
import torch

from modules import loaders, shared, ui, utils
from modules.logging_colors import logger
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model
from modules.models_settings import (
    apply_model_settings_to_state,
    save_model_settings,
    update_model_parameters
)
from modules.utils import gradio


def create_ui():
    # Finding the default values for the GPU and CPU memories
    total_mem = []
    for i in range(torch.cuda.device_count()):
        total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))

    default_gpu_mem = []
    if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
        for i in shared.args.gpu_memory:
            if 'mib' in i.lower():
                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
            else:
                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)

    while len(default_gpu_mem) < len(total_mem):
        default_gpu_mem.append(0)

    total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))
    if shared.args.cpu_memory is not None:
        default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)
    else:
        default_cpu_mem = 0

    with gr.Tab("Model", elem_id="model-tab"):
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=shared.model_name, label='Model', elem_classes='slim-dropdown')
                            ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button')
                            shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button')
                            shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button')
                            shared.gradio['reload_model'] = gr.Button("Reload", elem_classes='refresh-button')
                            shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button')

                    with gr.Column():
                        with gr.Row():
                            shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown')
                            ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button')
                            shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button')

        with gr.Row():
            with gr.Column():
                shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
                with gr.Box():
                    with gr.Row():
                        with gr.Column():
                            for i in range(len(total_mem)):
                                shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])

                            shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)
                            shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:')
                            shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype)
                            shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)

                            shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)
                            shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx)
                            shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
                            shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
                            shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='GGML only (not used by GGUF): Grouped-Query Attention. Must be 8 for llama-2 70b.')
                            shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='GGML only (not used by GGUF): 5e-6 is a good value for llama-2 models.')

                            shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
                            shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None")
                            shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
                            shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                            shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
                            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                            shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
                            shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
                            shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
                            shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)

                        with gr.Column():
                            shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
                            shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.')
                            shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
                            shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                            shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
                            shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel, which can improve inference speed on some systems.')
                            shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu)
                            shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                            shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
                            shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
                            shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                            shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
                            shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram)
                            shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
                            shared.gradio['mul_mat_q'] = gr.Checkbox(label="mul_mat_q", value=shared.args.mul_mat_q)
                            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
                            shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                            shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
                            shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                            shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
                            shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
                            shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".')

            with gr.Column():
                with gr.Row():
                    shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.')

                shared.gradio['custom_model_menu'] = gr.Textbox(label="Download custom model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main")
                shared.gradio['download_model_button'] = gr.Button("Download")

                with gr.Row():
                    shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')


def create_event_handlers():
    shared.gradio['loader'].change(
        loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then(
        lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type'))

    # In this event handler, the interface state is read and updated
    # with the model defaults (if any), and then the model is loaded
    # unless "autoload_model" is unchecked
    shared.gradio['model_menu'].change(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then(
        ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then(
        update_model_parameters, gradio('interface_state'), None).then(
        load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success(
        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length'))

    shared.gradio['load_model'].click(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        update_model_parameters, gradio('interface_state'), None).then(
        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(
        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length'))

    shared.gradio['unload_model'].click(
        unload_model, None, None).then(
        lambda: "Model unloaded", None, gradio('model_status'))

    shared.gradio['reload_model'].click(
        unload_model, None, None).then(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        update_model_parameters, gradio('interface_state'), None).then(
        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(
        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length'))

    shared.gradio['save_model_settings'].click(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)

    shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
    shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu'), gradio('model_status'), show_progress=True)
    shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))


def load_model_wrapper(selected_model, loader, autoload=False):
    if not autoload:
        yield f"The settings for {selected_model} have been updated.\nClick on \"Load\" to load it."
        return

    if selected_model == 'None':
        yield "No model selected"
    else:
        try:
            yield f"Loading {selected_model}..."
            shared.model_name = selected_model
            unload_model()
            if selected_model != '':
                shared.model, shared.tokenizer = load_model(shared.model_name, loader)

            if shared.model is not None:
                yield f"Successfully loaded {selected_model}"
            else:
                yield f"Failed to load {selected_model}."
        except:
            exc = traceback.format_exc()
            logger.error('Failed to load the model.')
            print(exc)
            yield exc.replace('\n', '\n\n')


def load_lora_wrapper(selected_loras):
    yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras)))
    add_lora_to_model(selected_loras)
    yield ("Successfuly applied the LoRAs")


def download_model_wrapper(repo_id, progress=gr.Progress()):
    try:
        downloader_module = importlib.import_module("download-model")
        downloader = downloader_module.ModelDownloader()
        repo_id_parts = repo_id.split(":")
        model = repo_id_parts[0] if len(repo_id_parts) > 0 else repo_id
        branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main"
        check = False

        progress(0.0)
        yield ("Cleaning up the model/branch names")
        model, branch = downloader.sanitize_model_and_branch_names(model, branch)

        yield ("Getting the download links from Hugging Face")
        links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch, text_only=False)

        yield ("Getting the output folder")
        base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir
        output_folder = downloader.get_output_folder(model, branch, is_lora, base_folder=base_folder)

        if check:
            progress(0.5)
            yield ("Checking previously downloaded files")
            downloader.check_model_files(model, branch, links, sha256, output_folder)
            progress(1.0)
        else:
            yield (f"Downloading files to {output_folder}")
            downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1)
            yield ("Done!")
    except:
        progress(1.0)
        yield traceback.format_exc().replace('\n', '\n\n')


def update_truncation_length(current_length, state):
    if state['loader'] in ['ExLlama', 'ExLlama_HF']:
        return state['max_seq_len']
    elif state['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
        return state['n_ctx']
    else:
        return current_length
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00			`import importlib`
			`import math`
			`import re`
			`import traceback`
			`from functools import partial`

			`import gradio as gr`
			`import psutil`
			`import torch`

			`from modules import loaders, shared, ui, utils`
			`from modules.logging_colors import logger`
			`from modules.LoRA import add_lora_to_model`
			`from modules.models import load_model, unload_model`
			`from modules.models_settings import (`
			`apply_model_settings_to_state,`
			`save_model_settings,`
			`update_model_parameters`
			`)`
			`from modules.utils import gradio`


			`def create_ui():`
			`# Finding the default values for the GPU and CPU memories`
			`total_mem = []`
			`for i in range(torch.cuda.device_count()):`
			`total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))`

			`default_gpu_mem = []`
			`if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:`
			`for i in shared.args.gpu_memory:`
			`if 'mib' in i.lower():`
			`default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))`
			`else:`
			`default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)`
Bump llama-cpp-python, +tensor_split by @shouyiwang, +mul_mat_q (#3610) 2023-08-18 17:03:34 +02:00
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00			`while len(default_gpu_mem) < len(total_mem):`
			`default_gpu_mem.append(0)`

			`total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))`
			`if shared.args.cpu_memory is not None:`
			`default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)`
			`else:`
			`default_cpu_mem = 0`

			`with gr.Tab("Model", elem_id="model-tab"):`
			`with gr.Row():`
			`with gr.Column():`
			`with gr.Row():`
			`with gr.Column():`
			`with gr.Row():`
			`shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=shared.model_name, label='Model', elem_classes='slim-dropdown')`
			`ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button')`
			`shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button')`
			`shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button')`
			`shared.gradio['reload_model'] = gr.Button("Reload", elem_classes='refresh-button')`
			`shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button')`

			`with gr.Column():`
			`with gr.Row():`
			`shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown')`
			`ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button')`
			`shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button')`

			`with gr.Row():`
			`with gr.Column():`
Add ctransformers support (#3313) --------- Co-authored-by: cal066 <cal066@users.noreply.github.com> Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> Co-authored-by: randoentity <137087500+randoentity@users.noreply.github.com> 2023-08-11 19:41:33 +02:00			`shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00			`with gr.Box():`
			`with gr.Row():`
			`with gr.Column():`
			`for i in range(len(total_mem)):`
			`shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])`

			`shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)`
			`shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:')`
			`shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype)`
			`shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)`

			`shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)`
			`shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx)`
			`shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)`
			`shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)`
Change some comments 2023-08-27 07:24:44 +02:00			`shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='GGML only (not used by GGUF): Grouped-Query Attention. Must be 8 for llama-2 70b.')`
			`shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='GGML only (not used by GGUF): 5e-6 is a good value for llama-2 models.')`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00
			`shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")`
			`shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None")`
Add ctransformers support (#3313) --------- Co-authored-by: cal066 <cal066@users.noreply.github.com> Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> Co-authored-by: randoentity <137087500+randoentity@users.noreply.github.com> 2023-08-11 19:41:33 +02:00			`shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00			`shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)`
			`shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')`
			`shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')`
			`shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)`
Add RoPE scaling support for transformers (including dynamic NTK) https://github.com/huggingface/transformers/pull/24653 2023-08-09 06:24:28 +02:00			`shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)`
Detect CodeLlama settings 2023-08-25 16:06:57 +02:00			`shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)`
Add rope_freq_base parameter for CodeLlama 2023-08-25 15:53:37 +02:00			`shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00
			`with gr.Column():`
			`shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)`
			`shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.')`
			`shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')`
			`shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')`
			`shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')`
Add the --disable_exllama option for AutoGPTQ 2023-08-12 08:26:58 +02:00			`shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel, which can improve inference speed on some systems.')`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00			`shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu)`
			`shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)`
			`shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)`
			`shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)`
			`shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)`
			`shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)`
			`shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)`
			`shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)`
			`shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram)`
			`shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)`
Bump llama-cpp-python, +tensor_split by @shouyiwang, +mul_mat_q (#3610) 2023-08-18 17:03:34 +02:00			`shared.gradio['mul_mat_q'] = gr.Checkbox(label="mul_mat_q", value=shared.args.mul_mat_q)`
Implement CFG for ExLlama_HF (#3666) 2023-08-24 21:27:36 +02:00			`shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')`
Bump llama-cpp-python, +tensor_split by @shouyiwang, +mul_mat_q (#3610) 2023-08-18 17:03:34 +02:00			`shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00			`shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)`
			`shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')`
Update installation documentation 2023-08-10 07:53:48 +02:00			`shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00			`shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')`
			`shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')`
			`shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".')`

			`with gr.Column():`
			`with gr.Row():`
			`shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.')`

			`shared.gradio['custom_model_menu'] = gr.Textbox(label="Download custom model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main")`
			`shared.gradio['download_model_button'] = gr.Button("Download")`

			`with gr.Row():`
			`shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')`


			`def create_event_handlers():`
Add ctransformers support (#3313) --------- Co-authored-by: cal066 <cal066@users.noreply.github.com> Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> Co-authored-by: randoentity <137087500+randoentity@users.noreply.github.com> 2023-08-11 19:41:33 +02:00			`shared.gradio['loader'].change(`
			`loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then(`
			`lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type'))`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00
			`# In this event handler, the interface state is read and updated`
			`# with the model defaults (if any), and then the model is loaded`
			`# unless "autoload_model" is unchecked`
			`shared.gradio['model_menu'].change(`
			`ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(`
			`apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then(`
			`ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then(`
			`update_model_parameters, gradio('interface_state'), None).then(`
Update truncation length based on max_seq_len/n_ctx 2023-08-27 08:10:45 +02:00			`load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success(`
			`update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length'))`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00
			`shared.gradio['load_model'].click(`
			`ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(`
			`update_model_parameters, gradio('interface_state'), None).then(`
Update truncation length based on max_seq_len/n_ctx 2023-08-27 08:10:45 +02:00			`partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(`
			`update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length'))`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00
			`shared.gradio['unload_model'].click(`
			`unload_model, None, None).then(`
			`lambda: "Model unloaded", None, gradio('model_status'))`

			`shared.gradio['reload_model'].click(`
			`unload_model, None, None).then(`
			`ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(`
			`update_model_parameters, gradio('interface_state'), None).then(`
Update truncation length based on max_seq_len/n_ctx 2023-08-27 08:10:45 +02:00			`partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(`
			`update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length'))`
Refactor everything (#3481) 2023-08-07 02:49:27 +02:00
			`shared.gradio['save_model_settings'].click(`
			`ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(`
			`save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)`

			`shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)`
			`shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu'), gradio('model_status'), show_progress=True)`
			`shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))`


			`def load_model_wrapper(selected_model, loader, autoload=False):`
			`if not autoload:`
			`yield f"The settings for {selected_model} have been updated.\nClick on \"Load\" to load it."`
			`return`

			`if selected_model == 'None':`
			`yield "No model selected"`
			`else:`
			`try:`
			`yield f"Loading {selected_model}..."`
			`shared.model_name = selected_model`
			`unload_model()`
			`if selected_model != '':`
			`shared.model, shared.tokenizer = load_model(shared.model_name, loader)`

			`if shared.model is not None:`
			`yield f"Successfully loaded {selected_model}"`
			`else:`
			`yield f"Failed to load {selected_model}."`
			`except:`
			`exc = traceback.format_exc()`
			`logger.error('Failed to load the model.')`
			`print(exc)`
			`yield exc.replace('\n', '\n\n')`


			`def load_lora_wrapper(selected_loras):`
			`yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras)))`
			`add_lora_to_model(selected_loras)`
			`yield ("Successfuly applied the LoRAs")`


			`def download_model_wrapper(repo_id, progress=gr.Progress()):`
			`try:`
			`downloader_module = importlib.import_module("download-model")`
			`downloader = downloader_module.ModelDownloader()`
			`repo_id_parts = repo_id.split(":")`
			`model = repo_id_parts[0] if len(repo_id_parts) > 0 else repo_id`
			`branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main"`
			`check = False`

			`progress(0.0)`
			`yield ("Cleaning up the model/branch names")`
			`model, branch = downloader.sanitize_model_and_branch_names(model, branch)`

			`yield ("Getting the download links from Hugging Face")`
			`links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch, text_only=False)`

			`yield ("Getting the output folder")`
			`base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir`
			`output_folder = downloader.get_output_folder(model, branch, is_lora, base_folder=base_folder)`

			`if check:`
			`progress(0.5)`
			`yield ("Checking previously downloaded files")`
			`downloader.check_model_files(model, branch, links, sha256, output_folder)`
			`progress(1.0)`
			`else:`
			`yield (f"Downloading files to {output_folder}")`
			`downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1)`
			`yield ("Done!")`
			`except:`
			`progress(1.0)`
			`yield traceback.format_exc().replace('\n', '\n\n')`
Update truncation length based on max_seq_len/n_ctx 2023-08-27 08:10:45 +02:00

			`def update_truncation_length(current_length, state):`
			`if state['loader'] in ['ExLlama', 'ExLlama_HF']:`
			`return state['max_seq_len']`
			`elif state['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:`
			`return state['n_ctx']`
			`else:`
			`return current_length`