Mirror of https://github.com/oobabooga/text-generation-webui.git

Commit f276d88546 (parent 632571a009)
Use AutoGPTQ by default for GPTQ models

 README.md | 18 lines changed
README.md
@@ -244,10 +244,18 @@ Optionally, you can use the following command-line flags:
 | `--n_ctx N_CTX`           | Size of the prompt context. |
 | `--llama_cpp_seed SEED`   | Seed for llama-cpp models. Default 0 (random). |
 
-#### GPTQ
+#### AutoGPTQ
 
+| Flag             | Description |
+|------------------|-------------|
+| `--triton`       | Use triton. |
+| `--desc_act`     | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
+
+#### GPTQ-for-LLaMa
+
 | Flag                      | Description |
 |---------------------------|-------------|
+| `--gptq-for-llama`        | Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ. |
 | `--wbits WBITS`           | Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
 | `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
 | `--groupsize GROUPSIZE`   | Group size. |
@@ -258,14 +266,6 @@ Optionally, you can use the following command-line flags:
 | `--warmup_autotune`       | (triton) Enable warmup autotune. |
 | `--fused_mlp`             | (triton) Enable fused mlp. |
 
-#### AutoGPTQ
-
-| Flag             | Description |
-|------------------|-------------|
-| `--autogptq`     | Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader. |
-| `--triton`       | Use triton. |
-| `--desc_act`     | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
-
 #### FlexGen
 
 | Flag | Description |
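For context, the `--wbits`, `--groupsize`, and `--desc_act` flags documented above correspond to AutoGPTQ's `BaseQuantizeConfig`, which is only needed when a model ships without a `quantize_config.json`. A minimal sketch, assuming the `auto-gptq` package is installed and using a hypothetical local model path (this is not the webui's loader code):

```python
# Minimal sketch (not part of this commit): the quantize config that
# --wbits / --groupsize / --desc_act describe, for a model lacking quantize_config.json.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantize_config = BaseQuantizeConfig(
    bits=4,          # --wbits 4
    group_size=128,  # --groupsize 128
    desc_act=False,  # --desc_act omitted
)

model = AutoGPTQForCausalLM.from_quantized(
    "models/my-gptq-model",        # hypothetical path under the models folder
    quantize_config=quantize_config,
    use_triton=False,              # set True when passing --triton
)
```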
modules/models.py
@@ -81,10 +81,10 @@ def load_model(model_name):
         logger.error('The path to the model does not exist. Exiting.')
         return None, None
 
-    if shared.args.autogptq:
-        load_func = AutoGPTQ_loader
-    elif shared.args.wbits > 0:
+    if shared.args.gptq_for_llama:
         load_func = GPTQ_loader
+    elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or shared.args.wbits > 0:
+        load_func = AutoGPTQ_loader
     elif shared.model_type == 'llamacpp':
         load_func = llamacpp_loader
     elif shared.model_type == 'rwkv':
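The practical effect of the hunk above: AutoGPTQ is now picked whenever the model folder contains a `quantize_config.json` or `--wbits` is set, and GPTQ-for-LLaMa is used only when explicitly requested. A small, self-contained sketch of that selection rule (the helper name and the fallback label are illustrative, not the webui's code):

```python
# Illustrative helper, not part of the commit: reproduces the new default-loader rule.
from pathlib import Path

def pick_gptq_loader(model_dir: str, gptq_for_llama: bool, wbits: int) -> str:
    if gptq_for_llama:                       # --gptq-for-llama forces the old loader
        return "GPTQ-for-LLaMa"
    if Path(model_dir, "quantize_config.json").exists() or wbits > 0:
        return "AutoGPTQ"                    # new default for GPTQ models
    return "other loaders"                   # llama.cpp, RWKV, transformers, ...

print(pick_gptq_loader("models/my-gptq-model", gptq_for_llama=False, wbits=4))
# -> AutoGPTQ
```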
modules/shared.py
@@ -141,7 +141,8 @@ parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.')
 parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')
 
 # AutoGPTQ
-parser.add_argument('--autogptq', action='store_true', help='Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader.')
+parser.add_argument('--gptq-for-llama', action='store_true', help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')
+parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
 parser.add_argument('--triton', action='store_true', help='Use triton.')
 parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
 
@@ -181,12 +182,9 @@ parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The m
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
 
-# Deprecation warnings for parameters that have been renamed
-deprecated_dict = {}
-for k in deprecated_dict:
-    if getattr(args, k) != deprecated_dict[k][1]:
-        logger.warning(f"--{k} is deprecated and will be removed. Use --{deprecated_dict[k][0]} instead.")
-        setattr(args, deprecated_dict[k][0], getattr(args, k))
+# Deprecation warnings
+if args.autogptq:
+    logger.warning('--autogptq has been deprecated and will be removed soon. AutoGPTQ is now used by default for GPTQ models.')
 
 # Security warnings
 if args.trust_remote_code:
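The pattern above keeps the old flag accepted so existing scripts do not break, while steering users to the new behavior. A runnable, standalone sketch of the same idea using only the standard library (the logger name and the hard-coded test argv are illustrative):

```python
# Sketch of the deprecation pattern: the old --autogptq flag still parses,
# but now only emits a warning instead of changing the loader.
import argparse
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("shared")

parser = argparse.ArgumentParser()
parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
parser.add_argument('--gptq-for-llama', action='store_true',
                    help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')

args = parser.parse_args(['--autogptq'])  # simulate a user still passing the old flag

if args.autogptq:
    logger.warning('--autogptq has been deprecated and will be removed soon. '
                   'AutoGPTQ is now used by default for GPTQ models.')
```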
modules/ui.py
@@ -30,7 +30,7 @@ theme = gr.themes.Default(
 
 
 def list_model_elements():
-    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'autogptq', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
+    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'gptq_for_llama', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
     for i in range(torch.cuda.device_count()):
         elements.append(f'gpu_memory_{i}')
 
server.py
@@ -393,12 +393,12 @@ def create_model_menus():
         with gr.Row():
             with gr.Column():
                 gr.Markdown('AutoGPTQ')
-                shared.gradio['autogptq'] = gr.Checkbox(label="autogptq", value=shared.args.autogptq, info='Activate AutoGPTQ loader. gpu-memory should be used for CPU offloading instead of pre_layer.')
                 shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
                 shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
 
             with gr.Column():
                 gr.Markdown('GPTQ-for-LLaMa')
+                shared.gradio['gptq_for_llama'] = gr.Checkbox(label="gptq-for-llama", value=shared.args.gptq_for_llama, info='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ. pre_layer should be used for CPU offloading instead of gpu-memory.')
                 with gr.Row():
                     shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                     shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
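A stripped-down, standalone sketch of the UI change above, assuming only that gradio is installed (this is not the webui's actual create_model_menus code): the old autogptq checkbox disappears because AutoGPTQ is now the default, and an explicit opt-in checkbox appears under the GPTQ-for-LLaMa column.

```python
# Standalone sketch, not the webui's code: two loader columns, with an
# explicit gptq-for-llama opt-in and no checkbox needed for AutoGPTQ.
import gradio as gr

gptq_for_llama_default = False  # would normally come from shared.args.gptq_for_llama

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown('AutoGPTQ')
            triton = gr.Checkbox(label="triton", value=False)
            desc_act = gr.Checkbox(label="desc_act", value=False)
        with gr.Column():
            gr.Markdown('GPTQ-for-LLaMa')
            gptq_for_llama = gr.Checkbox(
                label="gptq-for-llama",
                value=gptq_for_llama_default,
                info='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.',
            )

if __name__ == "__main__":
    demo.launch()
```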