# Sliders for GPU layer offloading and context length. Each widget is stored
# in shared.gradio under its key and starts at the matching shared.args value.
shared.gradio['n_gpu_layers'] = gr.Slider(
    label="n-gpu-layers",
    minimum=0,
    maximum=256,
    value=shared.args.n_gpu_layers,
    info='Must be set to more than 0 for your GPU to be used.',
)

# The two context-length sliders share the same range: 0 up to
# shared.settings['truncation_length_max'], in steps of 256.
shared.gradio['n_ctx'] = gr.Slider(
    label="n_ctx",
    minimum=0,
    maximum=shared.settings['truncation_length_max'],
    step=256,
    value=shared.args.n_ctx,
    info='Context length. Try lowering this if you run out of memory while loading the model.',
)
shared.gradio['max_seq_len'] = gr.Slider(
    label='max_seq_len',
    minimum=0,
    maximum=shared.settings['truncation_length_max'],
    step=256,
    value=shared.args.max_seq_len,
    info='Context length. Try lowering this if you run out of memory while loading the model.',
)
# Numeric inputs for positional-embedding (RoPE) scaling. alpha_value keeps
# two decimals (precision=2); the other two are integers (precision=0).
shared.gradio['alpha_value'] = gr.Number(
    label='alpha_value',
    value=shared.args.alpha_value,
    precision=2,
    info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.',
)
shared.gradio['rope_freq_base'] = gr.Number(
    label='rope_freq_base',
    value=shared.args.rope_freq_base,
    precision=0,
    info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.',
)
shared.gradio['compress_pos_emb'] = gr.Number(
    label='compress_pos_emb',
    value=shared.args.compress_pos_emb,
    precision=0,
    info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.',
)
# Attention-implementation toggles, initialised from the same-named
# shared.args attributes.
shared.gradio['use_flash_attention_2'] = gr.Checkbox(
    label="use_flash_attention_2",
    value=shared.args.use_flash_attention_2,
    info='Set use_flash_attention_2=True while loading the model.',
)
shared.gradio['use_eager_attention'] = gr.Checkbox(
    label="use_eager_attention",
    value=shared.args.use_eager_attention,
    info='Set attn_implementation= eager while loading the model.',
)

# StreamingLLM controls: an on/off checkbox plus an integer sink-token count.
shared.gradio['streaming_llm'] = gr.Checkbox(
    label="streaming_llm",
    value=shared.args.streaming_llm,
    info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.',
)
shared.gradio['attention_sink_size'] = gr.Number(
    label="attention_sink_size",
    value=shared.args.attention_sink_size,
    precision=0,
    info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.',
)
# Loader checkboxes. Every widget takes its initial state from the
# same-named attribute on shared.args and is stored in shared.gradio.
shared.gradio['cpu'] = gr.Checkbox(
    label="cpu",
    value=shared.args.cpu,
    info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.',
)
shared.gradio['row_split'] = gr.Checkbox(
    label="row_split",
    value=shared.args.row_split,
    info='Split the model by rows across GPUs. This may improve multi-gpu performance.',
)
shared.gradio['no_offload_kqv'] = gr.Checkbox(
    label="no_offload_kqv",
    value=shared.args.no_offload_kqv,
    info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.',
)
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(
    label="no_inject_fused_attention",
    value=shared.args.no_inject_fused_attention,
    info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.',
)
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(
    label="no_inject_fused_mlp",
    value=shared.args.no_inject_fused_mlp,
    info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.',
)
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(
    label="no_use_cuda_fp16",
    value=shared.args.no_use_cuda_fp16,
    info='This can make models faster on some systems.',
)
shared.gradio['desc_act'] = gr.Checkbox(
    label="desc_act",
    value=shared.args.desc_act,
    info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.',
)
shared.gradio['autosplit'] = gr.Checkbox(
    label="autosplit",
    value=shared.args.autosplit,
    info='Automatically split the model tensors across the available GPUs.',
)
# Note: the visible label uses a hyphen ("cpp-runner") while the key and the
# shared.args attribute use an underscore.
shared.gradio['cpp_runner'] = gr.Checkbox(
    label="cpp-runner",
    value=shared.args.cpp_runner,
    info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.',
)
# Numeric input for the number of experts per token.
shared.gradio['num_experts_per_token'] = gr.Number(
    label="Number of experts per token",
    value=shared.args.num_experts_per_token,
    info='Only applies to MoE models like Mixtral.',
)

# The same flag drives both the checked state and interactivity, so the box
# can only be toggled in the UI when shared.args.trust_remote_code is truthy.
shared.gradio['trust_remote_code'] = gr.Checkbox(
    label="trust-remote-code",
    value=shared.args.trust_remote_code,
    interactive=shared.args.trust_remote_code,
    info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.',
)

# Remaining tokenizer / evaluation / kernel checkboxes, each seeded from the
# same-named shared.args attribute.
shared.gradio['no_use_fast'] = gr.Checkbox(
    label="no_use_fast",
    value=shared.args.no_use_fast,
    info='Set use_fast=False while loading the tokenizer.',
)
shared.gradio['logits_all'] = gr.Checkbox(
    label="logits_all",
    value=shared.args.logits_all,
    info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.',
)
shared.gradio['disable_exllama'] = gr.Checkbox(
    label="disable_exllama",
    value=shared.args.disable_exllama,
    info='Disable ExLlama kernel for GPTQ models.',
)
shared.gradio['disable_exllamav2'] = gr.Checkbox(
    label="disable_exllamav2",
    value=shared.args.disable_exllamav2,
    info='Disable ExLlamav2 kernel for GPTQ models.',
)
# Static Markdown notes about specific loaders. They are stored in
# shared.gradio like the input widgets so they can be looked up by key.
shared.gradio['exllamav2_info'] = gr.Markdown(
    "ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders."
)
shared.gradio['llamacpp_HF_info'] = gr.Markdown(
    "llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically."
)
shared.gradio['tensorrt_llm_info'] = gr.Markdown(
    '* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.'
)
# Model selection / download widgets. Several take interactive=notmu; notmu
# is defined outside this excerpt (presumably "not multi-user" -- confirm
# against its definition).
shared.gradio['autoload_model'] = gr.Checkbox(
    label='Autoload the model',
    value=shared.settings['autoload_model'],
    interactive=notmu,
    info='Whether to load the model as soon as it is selected in the Model dropdown.',
)
shared.gradio['custom_model_menu'] = gr.Textbox(
    label="Download model or LoRA",
    interactive=notmu,
    info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.",
)
# Unlabelled single-line box for an optional specific file name.
shared.gradio['download_specific_file'] = gr.Textbox(
    placeholder="File name (for GGUF models)",
    show_label=False,
    max_lines=1,
    interactive=notmu,
)
# value is passed as a callable rather than a fixed string, so
# shared.model_name is read when Gradio evaluates it, not when this line runs.
shared.gradio['gguf_menu'] = gr.Dropdown(
    label='Choose your GGUF',
    choices=utils.get_available_ggufs(),
    value=lambda: shared.model_name,
    elem_classes='slim-dropdown',
    interactive=notmu,
)
shared.gradio['unquantized_url'] = gr.Textbox(
    label="Enter the URL for the original (unquantized) model",
    max_lines=1,
    info="Example: https://huggingface.co/lmsys/vicuna-13b-v1.5",
)
# Instruction-template picker; the initial selection is the literal string 'None'.
shared.gradio['customized_template'] = gr.Dropdown(
    label='Select the desired instruction template',
    choices=utils.get_available_instruction_templates(),
    value='None',
    elem_classes='slim-dropdown',
)
# Help text for the customized-template option. It is rendered as Markdown
# and, unlike the widgets around it, not stored in shared.gradio.
gr.Markdown(
    "This allows you to set a customized template for the model currently selected in the \"Model loader\" menu. Whenever the model gets loaded, this template will be used in place of the template specified in the model's metadata, which sometimes is wrong."
)
# Append a hint to the status text naming the instruction template read from
# settings['instruction_template'].
output += (
    '\n\nIt seems to be an instruction-following model with template "{}". In the chat tab, instruct or chat-instruct modes should be used.'
    .format(settings['instruction_template'])
)