diff --git a/README.md b/README.md index 6694e500..278e5e3a 100644 --- a/README.md +++ b/README.md @@ -262,6 +262,7 @@ Optionally, you can use the following command-line flags: | `--no_inject_fused_mlp` | Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference. | | `--no_use_cuda_fp16` | This can make models faster on some systems. | | `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. | +| `--disable_exllama` | Disable ExLlama kernel, which can improve inference speed on some systems. | #### ExLlama diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py index 0d41ac0a..987f5ba7 100644 --- a/modules/AutoGPTQ_loader.py +++ b/modules/AutoGPTQ_loader.py @@ -50,6 +50,7 @@ def load_quantized(model_name): 'max_memory': get_max_memory_dict(), 'quantize_config': quantize_config, 'use_cuda_fp16': not shared.args.no_use_cuda_fp16, + 'disable_exllama': shared.args.disable_exllama, } logger.info(f"The AutoGPTQ params are: {params}") diff --git a/modules/loaders.py b/modules/loaders.py index 9a222a72..a96c43ea 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -46,6 +46,7 @@ loaders_and_params = OrderedDict({ 'wbits', 'groupsize', 'desc_act', + 'disable_exllama', 'gpu_memory', 'cpu_memory', 'cpu', diff --git a/modules/shared.py b/modules/shared.py index cb6f0ae1..ba89fb52 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -145,6 +145,7 @@ parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).') parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') +parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') # ExLlama parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7") diff --git a/modules/ui.py b/modules/ui.py index b58b7dd6..37284d25 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -58,6 +58,7 @@ def list_model_elements(): 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', + 'disable_exllama', 'threads', 'n_batch', 'no_mmap', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 7b852a44..3059f616 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -98,6 +98,7 @@ def create_ui(): shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') + shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel, which can improve inference speed on some systems.') shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)