mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-29 02:49:30 +01:00
Add the --disable_exllama option for AutoGPTQ
This commit is contained in:
parent
0e05818266
commit
0230fa4e9c
@ -262,6 +262,7 @@ Optionally, you can use the following command-line flags:
|
|||||||
| `--no_inject_fused_mlp` | Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference. |
|
| `--no_inject_fused_mlp` | Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference. |
|
||||||
| `--no_use_cuda_fp16` | This can make models faster on some systems. |
|
| `--no_use_cuda_fp16` | This can make models faster on some systems. |
|
||||||
| `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
|
| `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
|
||||||
|
| `--disable_exllama` | Disable the ExLlama kernel. Disabling it can improve inference speed on some systems. |
|
||||||
|
|
||||||
#### ExLlama
|
#### ExLlama
|
||||||
|
|
||||||
|
@ -50,6 +50,7 @@ def load_quantized(model_name):
|
|||||||
'max_memory': get_max_memory_dict(),
|
'max_memory': get_max_memory_dict(),
|
||||||
'quantize_config': quantize_config,
|
'quantize_config': quantize_config,
|
||||||
'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
|
'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
|
||||||
|
'disable_exllama': shared.args.disable_exllama,
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(f"The AutoGPTQ params are: {params}")
|
logger.info(f"The AutoGPTQ params are: {params}")
|
||||||
|
@ -46,6 +46,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'wbits',
|
'wbits',
|
||||||
'groupsize',
|
'groupsize',
|
||||||
'desc_act',
|
'desc_act',
|
||||||
|
'disable_exllama',
|
||||||
'gpu_memory',
|
'gpu_memory',
|
||||||
'cpu_memory',
|
'cpu_memory',
|
||||||
'cpu',
|
'cpu',
|
||||||
|
@ -145,6 +145,7 @@ parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do
|
|||||||
parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')
|
parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')
|
||||||
parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
|
parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
|
||||||
parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
|
parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
|
||||||
|
parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
|
||||||
|
|
||||||
# ExLlama
|
# ExLlama
|
||||||
parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
|
parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
|
||||||
|
@ -58,6 +58,7 @@ def list_model_elements():
|
|||||||
'no_inject_fused_attention',
|
'no_inject_fused_attention',
|
||||||
'no_inject_fused_mlp',
|
'no_inject_fused_mlp',
|
||||||
'no_use_cuda_fp16',
|
'no_use_cuda_fp16',
|
||||||
|
'disable_exllama',
|
||||||
'threads',
|
'threads',
|
||||||
'n_batch',
|
'n_batch',
|
||||||
'no_mmap',
|
'no_mmap',
|
||||||
|
@ -98,6 +98,7 @@ def create_ui():
|
|||||||
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
|
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
|
||||||
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
|
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
|
||||||
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
|
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
|
||||||
|
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel, which can improve inference speed on some systems.')
|
||||||
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu)
|
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu)
|
||||||
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
|
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
|
||||||
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
|
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
|
||||||
|
Loading…
Reference in New Issue
Block a user