shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='Grouped-query attention. Must be set to 8 for Llama 2 70B.')
shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='5e-6 is a good value for Llama 2 models.')
shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb)
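# Illustrative note (not part of the original UI code): compress_pos_emb applies linear RoPE
# scaling, so a model trained on a 2048-token context that is extended to max_seq_len = 4096
# would typically use compress_pos_emb = 4096 / 2048 = 2, with alpha_value left at 1.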
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention speeds up inference but uses more VRAM; disable it if you are running low on VRAM.')
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP speeds up inference but uses more VRAM; disable it if you are running low on VRAM.')
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='Disable the FP16 CUDA kernels. This can make models faster on some systems.')
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info="'desc_act', 'wbits', and 'groupsize' are used for old models without a quantize_config.json.")
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable the ExLlama kernel, which can improve inference speed on some systems.')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs. Comma-separated list of proportions, e.g. 18,17.')
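# Illustrative note (not part of the original UI code): the tensor_split values are relative
# proportions, not gigabytes. With two GPUs, "18,17" assigns roughly 18/35 of the model to
# the first GPU and 17/35 to the second; something like "1,1" would split it evenly.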
shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following [these instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".')
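# Illustrative note (not part of the original UI code): besides the "Download custom model
# or LoRA" field below, the tokenizer could also be fetched from a terminal with the
# repository's download script, e.g.:
#     python download-model.py oobabooga/llama-tokenizer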
with gr.Column():
    with gr.Row():
        shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.')

    shared.gradio['custom_model_menu'] = gr.Textbox(label="Download custom model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main")
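    # Illustrative note (not part of the original UI code): the "user/model:branch" syntax
    # can be parsed with a simple split, conceptually something like:
    #     repo, _, branch = "facebook/galactica-125m:main".partition(':')  # -> ('facebook/galactica-125m', ':', 'main')
    #     branch = branch or 'main'  # fall back to the default branch when no ":" suffix is given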