From f276d88546a5a3ec9b3ddb2c71d0b24d46afd23f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 5 Jun 2023 15:41:48 -0300
Subject: [PATCH] Use AutoGPTQ by default for GPTQ models

---
 README.md         | 18 +++++++++---------
 modules/models.py |  6 +++---
 modules/shared.py | 12 +++++-------
 modules/ui.py     |  2 +-
 server.py         |  2 +-
 5 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index cfb90c46..b8ea025f 100644
--- a/README.md
+++ b/README.md
@@ -244,10 +244,18 @@ Optionally, you can use the following command-line flags:
 | `--n_ctx N_CTX` | Size of the prompt context. |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
 
-#### GPTQ
+#### AutoGPTQ
+
+| Flag | Description |
+|------------------|-------------|
+| `--triton` | Use triton. |
+| `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
+
+#### GPTQ-for-LLaMa
 
 | Flag | Description |
 |---------------------------|-------------|
+| `--gptq-for-llama` | Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ. |
 | `--wbits WBITS` | Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
 | `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
 | `--groupsize GROUPSIZE` | Group size. |
@@ -258,14 +266,6 @@ Optionally, you can use the following command-line flags:
 | `--warmup_autotune` | (triton) Enable warmup autotune. |
 | `--fused_mlp` | (triton) Enable fused mlp. |
 
-#### AutoGPTQ
-
-| Flag | Description |
-|------------------|-------------|
-| `--autogptq` | Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader. |
-| `--triton` | Use triton. |
-|` --desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
-
 #### FlexGen
 
 | Flag | Description |
diff --git a/modules/models.py b/modules/models.py
index 575f28e1..3972133a 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -81,10 +81,10 @@ def load_model(model_name):
         logger.error('The path to the model does not exist. Exiting.')
         return None, None
 
-    if shared.args.autogptq:
-        load_func = AutoGPTQ_loader
-    elif shared.args.wbits > 0:
+    if shared.args.gptq_for_llama:
         load_func = GPTQ_loader
+    elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or shared.args.wbits > 0:
+        load_func = AutoGPTQ_loader
     elif shared.model_type == 'llamacpp':
         load_func = llamacpp_loader
     elif shared.model_type == 'rwkv':
diff --git a/modules/shared.py b/modules/shared.py
index 9a025587..d57efef4 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -141,7 +141,8 @@ parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Ena
 parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')
 
 # AutoGPTQ
-parser.add_argument('--autogptq', action='store_true', help='Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader.')
+parser.add_argument('--gptq-for-llama', action='store_true', help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')
+parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
 parser.add_argument('--triton', action='store_true', help='Use triton.')
 parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
 
@@ -181,12 +182,9 @@ parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The m
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
 
-# Deprecation warnings for parameters that have been renamed
-deprecated_dict = {}
-for k in deprecated_dict:
-    if getattr(args, k) != deprecated_dict[k][1]:
-        logger.warning(f"--{k} is deprecated and will be removed. Use --{deprecated_dict[k][0]} instead.")
-        setattr(args, deprecated_dict[k][0], getattr(args, k))
+# Deprecation warnings
+if args.autogptq:
+    logger.warning('--autogptq has been deprecated and will be removed soon. AutoGPTQ is now used by default for GPTQ models.')
 
 # Security warnings
 if args.trust_remote_code:
diff --git a/modules/ui.py b/modules/ui.py
index 62796032..a10edec2 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -30,7 +30,7 @@ theme = gr.themes.Default(
 
 
 def list_model_elements():
-    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'autogptq', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
+    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'gptq_for_llama', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
     for i in range(torch.cuda.device_count()):
         elements.append(f'gpu_memory_{i}')
 
diff --git a/server.py b/server.py
index 8bfb45aa..ce7086a5 100644
--- a/server.py
+++ b/server.py
@@ -393,12 +393,12 @@ def create_model_menus():
         with gr.Row():
             with gr.Column():
                 gr.Markdown('AutoGPTQ')
-                shared.gradio['autogptq'] = gr.Checkbox(label="autogptq", value=shared.args.autogptq, info='Activate AutoGPTQ loader. gpu-memory should be used for CPU offloading instead of pre_layer.')
                 shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
                 shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
 
             with gr.Column():
                 gr.Markdown('GPTQ-for-LLaMa')
+                shared.gradio['gptq_for_llama'] = gr.Checkbox(label="gptq-for-llama", value=shared.args.gptq_for_llama, info='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ. pre_layer should be used for CPU offloading instead of gpu-memory.')
                 with gr.Row():
                     shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                     shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
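
For readers skimming the patch, the following is a minimal, self-contained sketch of the loader-selection order that the modules/models.py hunk above establishes. The `pick_loader` helper and the non-GPTQ fallback names are illustrative assumptions rather than repository code; only the priority order (an explicit --gptq-for-llama first, then AutoGPTQ whenever a quantize_config.json is present or --wbits is set) comes from the diff itself.

```python
from argparse import Namespace
from pathlib import Path


def pick_loader(model_name, args, model_type, model_dir='models'):
    """Illustrative helper mirroring the dispatch order in the patched load_model()."""
    if args.gptq_for_llama:
        # --gptq-for-llama explicitly opts back into GPTQ-for-LLaMa.
        return 'GPTQ_loader'
    if Path(f'{model_dir}/{model_name}/quantize_config.json').exists() or args.wbits > 0:
        # AutoGPTQ is now the default: chosen whenever a quantize_config.json
        # sits next to the model, or --wbits is set explicitly.
        return 'AutoGPTQ_loader'
    if model_type == 'llamacpp':
        return 'llamacpp_loader'
    if model_type == 'rwkv':
        return 'RWKV_loader'  # assumed name for the RWKV path
    return 'huggingface_loader'  # assumed name for the default Transformers path


# Example usage (the result depends on whether models/some-gptq-model/quantize_config.json
# exists on disk): with no flags set, a model shipping that file selects AutoGPTQ.
print(pick_loader('some-gptq-model', Namespace(gptq_for_llama=False, wbits=0), 'llama'))
```

The quantize_config.json check is what makes AutoGPTQ the default: GPTQ models quantized with AutoGPTQ ship that file, so no loader flag is needed anymore, while --gptq-for-llama remains as an explicit opt-out.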