# Parser copied from https://github.com/vladmandic/automatic
import argparse
import logging

logger = logging.getLogger(__name__)  # stand-in for the project's own logger

parser = argparse.ArgumentParser(
    description="Text generation web UI",
    conflict_handler='resolve',
    add_help=True,
    formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200),
)

# A single group keeps this snippet self-contained; the upstream file splits
# these flags into several named argument groups.
group = parser.add_argument_group('Options')
group.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.')
group.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.')
group.add_argument('--model', type=str, help='Name of the model to load by default.')
group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
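
# Illustrative sketch (not part of the original file): how a YAML settings
# file such as settings-template.yaml could be applied. Assumes PyYAML is
# installed; `interface_settings` is a hypothetical dict of interface defaults.
def _apply_settings_file(path, interface_settings):
    import yaml
    with open(path, encoding='utf-8') as f:
        interface_settings.update(yaml.safe_load(f) or {})
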
group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
group.add_argument('--idle-timeout', type=int, default=0, help='Unload the model after this many minutes of inactivity. It will be reloaded automatically when you try to use it again.')
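
# Sketch of how --idle-timeout could be enforced (an assumption, not the
# project's actual mechanism): a threading.Timer restarted on every request
# that calls a caller-supplied unload function when it fires.
import threading

_idle_timer = None

def _reset_idle_timer(minutes, unload_model):
    global _idle_timer
    if _idle_timer is not None:
        _idle_timer.cancel()
    if minutes > 0:
        _idle_timer = threading.Timer(minutes * 60, unload_model)
        _idle_timer.daemon = True
        _idle_timer.start()
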
group.add_argument('--loader', type=str, help='Choose the model loader manually; otherwise, it will be autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.')
group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same format as above.')
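
# Sketch (assumption, hypothetical helper): how --gpu-memory/--cpu-memory
# values could be turned into the max_memory mapping that transformers'
# from_pretrained() accepts. Per the help text above, bare numbers are GiB.
def _build_max_memory(gpu_memory, cpu_memory):
    max_memory = {}
    for device_id, amount in enumerate(gpu_memory or []):
        max_memory[device_id] = amount if amount.lower().endswith('ib') else f'{amount}GiB'
    if cpu_memory:
        max_memory['cpu'] = cpu_memory if cpu_memory.lower().endswith('ib') else f'{cpu_memory}GiB'
    return max_memory  # e.g. {0: '10GiB', 1: '5GiB', 'cpu': '30GiB'}
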
group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires an NVIDIA Ampere GPU.')
group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.')
group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
group.add_argument('--tensorcores', action='store_true', help='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.')
group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
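
# Sketch (hypothetical helper): parsing a --tensor_split value such as
# "60,40" into normalized per-GPU proportions.
def _parse_tensor_split(spec):
    parts = [float(p) for p in spec.split(',')]
    total = sum(parts)
    return [p / total for p in parts]  # "60,40" -> [0.6, 0.4]
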
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.')
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
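
# Sketch (hypothetical helper): converting a --cache-capacity value such as
# "2000MiB" or "2GiB" into bytes; a bare number is taken as bytes, matching
# the help text above.
def _cache_capacity_to_bytes(value):
    value = value.strip()
    for suffix, multiplier in (('GiB', 1024 ** 3), ('MiB', 1024 ** 2), ('KiB', 1024)):
        if value.endswith(suffix):
            return int(value[:-len(suffix)]) * multiplier
    return int(value)
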
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
group.add_argument('--attention-sink-size', type=int, default=5, help='StreamingLLM: number of sink tokens. Only used if the trimmed prompt does not share a prefix with the old prompt.')
group.add_argument('--autosplit', action='store_true', help='Automatically split the model tensors across the available GPUs. This causes --gpu-split to be ignored.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
group.add_argument('--rope_freq_base', type=int, default=0, help='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).')
group.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.")
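
# Worked example of the relation stated in the help texts above (sketch,
# hypothetical helper): rope_freq_base = 10000 * alpha_value ** (64 / 63),
# while compress_pos_emb equals 1 / rope_freq_scale.
def _alpha_to_rope_freq_base(alpha_value):
    return 10000 * alpha_value ** (64 / 63)  # e.g. alpha_value=2.0 -> ~20221
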
group.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
group.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
group.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
group.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.')
group.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.')
group.add_argument('--gradio-auth', type=str, default=None, help='Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3".')
group.add_argument('--gradio-auth-path', type=str, default=None, help='Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.')
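
# Sketch (hypothetical helper): merging --gradio-auth and --gradio-auth-path
# credentials into the list of (user, password) tuples that gradio's
# launch(auth=...) accepts.
def _parse_gradio_auth(inline_spec=None, file_path=None):
    raw = inline_spec or ''
    if file_path:
        with open(file_path, encoding='utf-8') as f:
            raw = ','.join(filter(None, [raw, f.read().replace('\n', ',')]))
    creds = []
    for pair in raw.split(','):
        if pair.strip():
            user, _, password = pair.strip().partition(':')
            creds.append((user, password))
    return creds  # "u1:p1,u2:p2" -> [('u1', 'p1'), ('u2', 'p2')]
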
group.add_argument('--ssl-keyfile', type=str, default=None, help='The path to the SSL certificate key file.')
group.add_argument('--ssl-certfile', type=str, default=None, help='The path to the SSL certificate cert file.')
group.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.')
logger.warning("\nYou are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")
ifargs.multi_user:
logger.warning('\nThe multi-user mode is highly experimental and should not be shared publicly.')