shared.gradio['n_ctx']=gr.Slider(minimum=0,maximum=shared.settings['truncation_length_max'],step=256,label="n_ctx",value=shared.args.n_ctx,info='Context length. Try lowering this if you run out of memory while loading the model.')
shared.gradio['max_seq_len']=gr.Slider(label='max_seq_len',minimum=0,maximum=shared.settings['truncation_length_max'],step=256,info='Context length. Try lowering this if you run out of memory while loading the model.',value=shared.args.max_seq_len)
shared.gradio['alpha_value']=gr.Slider(label='alpha_value',minimum=1,maximum=8,step=0.05,info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.',value=shared.args.alpha_value)
shared.gradio['rope_freq_base']=gr.Slider(label='rope_freq_base',minimum=0,maximum=1000000,step=1000,info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)',value=shared.args.rope_freq_base)
shared.gradio['compress_pos_emb']=gr.Slider(label='compress_pos_emb',minimum=1,maximum=8,step=1,info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.',value=shared.args.compress_pos_emb)
shared.gradio['no_inject_fused_attention']=gr.Checkbox(label="no_inject_fused_attention",value=shared.args.no_inject_fused_attention,info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
shared.gradio['no_inject_fused_mlp']=gr.Checkbox(label="no_inject_fused_mlp",value=shared.args.no_inject_fused_mlp,info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
shared.gradio['no_use_cuda_fp16']=gr.Checkbox(label="no_use_cuda_fp16",value=shared.args.no_use_cuda_fp16,info='This can make models faster on some systems.')
shared.gradio['desc_act']=gr.Checkbox(label="desc_act",value=shared.args.desc_act,info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
shared.gradio['tensor_split']=gr.Textbox(label='tensor_split',info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
shared.gradio['trust_remote_code']=gr.Checkbox(label="trust-remote-code",value=shared.args.trust_remote_code,info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.',interactive=shared.args.trust_remote_code)
shared.gradio['logits_all']=gr.Checkbox(label="logits_all",value=shared.args.logits_all,info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
shared.gradio['use_flash_attention_2']=gr.Checkbox(label="use_flash_attention_2",value=shared.args.use_flash_attention_2,info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['gptq_for_llama_info']=gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
shared.gradio['exllama_info']=gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
shared.gradio['exllama_HF_info']=gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
shared.gradio['llamacpp_HF_info']=gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer.\n\nOption 2: place your .gguf in a subfolder of models/ along with these 3 files: tokenizer.model, tokenizer_config.json, and special_tokens_map.json. This takes precedence over Option 1.')
shared.gradio['autoload_model']=gr.Checkbox(value=shared.settings['autoload_model'],label='Autoload the model',info='Whether to load the model as soon as it is selected in the Model dropdown.',interactive=notmu)
shared.gradio['custom_model_menu']=gr.Textbox(label="Download model or LoRA",info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.",interactive=notmu)
shared.gradio['download_specific_file']=gr.Textbox(placeholder="File name (for GGUF models)",show_label=False,max_lines=1,interactive=notmu)
output+='\n\nIt seems to be an instruction-following model with template "{}". In the chat tab, instruct or chat-instruct modes should be used.'.format(settings['instruction_template'])