# Positional-embedding (RoPE) scaling controls. Each slider is registered in
# shared.gradio under its own name and starts at the matching shared.args value.
shared.gradio['alpha_value'] = gr.Slider(
    label='alpha_value',
    minimum=1,
    maximum=8,
    step=0.05,
    info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.',
    value=shared.args.alpha_value,
)

# Per the help text, a value greater than 0 here is used instead of alpha_value.
shared.gradio['rope_freq_base'] = gr.Slider(
    label='rope_freq_base',
    minimum=0,
    maximum=1000000,
    step=1000,
    info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)',
    value=shared.args.rope_freq_base,
)

# Integer-stepped compression factor (1 through 8).
shared.gradio['compress_pos_emb'] = gr.Slider(
    label='compress_pos_emb',
    minimum=1,
    maximum=8,
    step=1,
    info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.',
    value=shared.args.compress_pos_emb,
)
# Boolean loader options. Every checkbox takes its initial state from the
# attribute of shared.args that matches its shared.gradio key.
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(
    label="no_inject_fused_attention",
    value=shared.args.no_inject_fused_attention,
    info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.',
)
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(
    label="no_inject_fused_mlp",
    value=shared.args.no_inject_fused_mlp,
    info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.',
)
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(
    label="no_use_cuda_fp16",
    value=shared.args.no_use_cuda_fp16,
    info='This can make models faster on some systems.',
)
shared.gradio['desc_act'] = gr.Checkbox(
    label="desc_act",
    value=shared.args.desc_act,
    info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.',
)
shared.gradio['mul_mat_q'] = gr.Checkbox(
    label="mul_mat_q",
    value=shared.args.mul_mat_q,
    info='Recommended in most cases. Improves generation speed by 10-20%.',
)
# The visible label uses a hyphen ("cfg-cache") while the key and the args
# attribute use an underscore.
shared.gradio['cfg_cache'] = gr.Checkbox(
    label="cfg-cache",
    value=shared.args.cfg_cache,
    info='Create an additional cache for CFG negative prompts.',
)
# Free-text GPU split proportions. No initial value is passed here, unlike the
# neighbouring components that read theirs from shared.args.
shared.gradio['tensor_split'] = gr.Textbox(
    label='tensor_split',
    info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17',
)

# Numeric seed input; the label documents 0 as meaning "random".
shared.gradio['llama_cpp_seed'] = gr.Number(
    label='Seed (0 for random)',
    value=shared.args.llama_cpp_seed,
)

# Checkboxes initialised from the matching shared.args attributes.
shared.gradio['trust_remote_code'] = gr.Checkbox(
    label="trust-remote-code",
    value=shared.args.trust_remote_code,
    info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.',
)
shared.gradio['use_fast'] = gr.Checkbox(
    label="use_fast",
    value=shared.args.use_fast,
    info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.',
)
# Static Markdown help panels, one per loader, registered under "<loader>_info"
# keys in shared.gradio. The text is passed as the sole positional argument.
shared.gradio['gptq_for_llama_info'] = gr.Markdown(
    'GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).'
)
shared.gradio['exllama_info'] = gr.Markdown(
    'For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).'
)
shared.gradio['exllama_HF_info'] = gr.Markdown(
    'ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.'
)
shared.gradio['llamacpp_HF_info'] = gr.Markdown(
    'llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer.\n\nOption 2: place your .gguf in a subfolder of models/ along with these 3 files: tokenizer.model, tokenizer_config.json, and special_tokens_map.json. This takes precedence over Option 1.'
)
# Model autoload toggle and the two download text inputs. All three pass the
# same `notmu` value as their `interactive` flag.
# NOTE(review): `notmu` is not defined anywhere in the visible code. If this
# was meant to be the expression `not mu`, the spelling here is a bug — confirm
# against the surrounding function before changing it.
shared.gradio['autoload_model'] = gr.Checkbox(
    value=shared.settings['autoload_model'],
    label='Autoload the model',
    info='Whether to load the model as soon as it is selected in the Model dropdown.',
    interactive=notmu,
)
shared.gradio['custom_model_menu'] = gr.Textbox(
    label="Download model or LoRA",
    info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.",
    interactive=notmu,
)
# Single-line, label-less box for the optional file name.
shared.gradio['download_specific_file'] = gr.Textbox(
    placeholder="File name (for GGUF models)",
    show_label=False,
    max_lines=1,
    interactive=notmu,
)
# Append a hint to the status text that names the instruction template stored
# in `settings`. The literal is parenthesised so `.format` applies to the
# whole message.
output += (
    '\n\nIt seems to be an instruction-following model with template "{}". In the chat tab, instruct or chat-instruct modes should be used.'
).format(settings['instruction_template'])