AutoGPTQ: Add --disable_exllamav2 flag (Mixtral CPU offloading needs this)
Commit 3bbf6c601d (parent 7de10f4c8e)
```diff
@@ -285,6 +285,7 @@ List of command-line flags
 | `--no_use_cuda_fp16` | This can make models faster on some systems. |
 | `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
 | `--disable_exllama` | Disable ExLlama kernel, which can improve inference speed on some systems. |
+| `--disable_exllamav2` | Disable ExLlamav2 kernel. |
 
 #### GPTQ-for-LLaMa
 
```
```diff
@@ -52,6 +52,7 @@ def load_quantized(model_name):
         'quantize_config': quantize_config,
         'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
         'disable_exllama': shared.args.disable_exllama,
+        'disable_exllamav2': shared.args.disable_exllamav2,
     }
 
     logger.info(f"The AutoGPTQ params are: {params}")
```
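For reference, the params dict assembled here is what the loader ultimately unpacks into AutoGPTQ's `from_quantized()` call, so the new key only takes effect with an auto-gptq build recent enough to accept `disable_exllamav2`. A minimal sketch of that hand-off (model path and quantization settings are hypothetical):

```python
# Sketch only: mirrors how a params dict like the one above reaches auto-gptq.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

params = {
    'device': 'cuda:0',
    'quantize_config': quantize_config,
    'use_cuda_fp16': True,
    'disable_exllama': False,
    'disable_exllamav2': True,   # the new flag: skip the ExLlamav2 kernel
}

# Every entry in `params` becomes a keyword argument of from_quantized().
model = AutoGPTQForCausalLM.from_quantized('models/some-gptq-model', **params)
```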
```diff
@@ -25,6 +25,7 @@ loaders_and_params = OrderedDict({
         'rope_freq_base',
         'compress_pos_emb',
         'disable_exllama',
+        'disable_exllamav2',
         'transformers_info'
     ],
     'llama.cpp': [
```
```diff
@@ -94,6 +95,7 @@ loaders_and_params = OrderedDict({
         'groupsize',
         'desc_act',
         'disable_exllama',
+        'disable_exllamav2',
         'gpu_memory',
         'cpu_memory',
         'cpu',
```
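`loaders_and_params` is the table that decides which options each loader exposes in the UI, which is why the new flag has to be registered under both the Transformers and AutoGPTQ entries. A stripped-down sketch of how such a mapping is typically consumed (loader names abbreviated, helper name hypothetical):

```python
from collections import OrderedDict

# Abbreviated stand-in for the real mapping: loader name -> UI elements to show.
loaders_and_params = OrderedDict({
    'Transformers': ['gpu_memory', 'cpu_memory', 'disable_exllama', 'disable_exllamav2'],
    'AutoGPTQ': ['wbits', 'groupsize', 'desc_act', 'disable_exllama', 'disable_exllamav2'],
})

def visible_elements(loader: str) -> list[str]:
    """Return the UI elements that should be shown for the selected loader."""
    return loaders_and_params.get(loader, [])

print(visible_elements('AutoGPTQ'))  # includes 'disable_exllamav2'
```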
```diff
@@ -156,7 +156,7 @@ def huggingface_loader(model_name):
         LoaderClass = AutoModelForCausalLM
 
     # Load the model in simple 16-bit mode by default
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]):
         model = LoaderClass.from_pretrained(path_to_model, **params)
         if torch.backends.mps.is_available():
             device = torch.device('mps')
```
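The long condition simply asks whether any flag rules out plain 16-bit loading; with this change, opting out of the ExLlamav2 kernel also routes the model through the custom loading path. A hedged sketch of the same check factored into a helper (names are hypothetical, not how the project structures it):

```python
from types import SimpleNamespace

def wants_simple_fp16(args) -> bool:
    """True when no flag forces a custom loading path (offloading, quantization, kernel opt-outs)."""
    return not any([
        args.cpu, args.load_in_8bit, args.load_in_4bit, args.auto_devices, args.disk,
        args.deepspeed, args.gpu_memory is not None, args.cpu_memory is not None,
        args.compress_pos_emb > 1, args.alpha_value > 1,
        args.disable_exllama, args.disable_exllamav2,  # either kernel opt-out leaves the fast path
    ])

# Disabling the ExLlamav2 kernel alone is now enough to leave simple 16-bit mode.
args = SimpleNamespace(cpu=False, load_in_8bit=False, load_in_4bit=False, auto_devices=False,
                       disk=False, deepspeed=False, gpu_memory=None, cpu_memory=None,
                       compress_pos_emb=1, alpha_value=1,
                       disable_exllama=False, disable_exllamav2=True)
print(wants_simple_fp16(args))  # False
```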
```diff
@@ -221,11 +221,16 @@ def huggingface_loader(model_name):
         if shared.args.disk:
             params['offload_folder'] = shared.args.disk_cache_dir
 
-        if shared.args.disable_exllama:
+        if shared.args.disable_exllama or shared.args.disable_exllamav2:
             try:
-                gptq_config = GPTQConfig(bits=config.quantization_config.get('bits', 4), disable_exllama=True)
+                gptq_config = GPTQConfig(
+                    bits=config.quantization_config.get('bits', 4),
+                    disable_exllama=shared.args.disable_exllama,
+                    disable_exllamav2=shared.args.disable_exllamav2,
+                )
+
                 params['quantization_config'] = gptq_config
-                logger.info('Loading with ExLlama kernel disabled.')
+                logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.')
             except:
                 exc = traceback.format_exc()
                 logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?')
```
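This is the part that matters for the Mixtral case in the commit title: when either flag is set, the quantization config asks Transformers to skip the ExLlama kernels, which is what makes GPU+CPU offloading of a Mixtral GPTQ quant workable. A minimal sketch of the equivalent direct Transformers call; the checkpoint name and memory split are hypothetical, and whether `GPTQConfig` honors a `disable_exllamav2` argument depends on the installed transformers/optimum versions:

```python
# Roughly what the loader sets up when both kernel opt-outs are passed.
# Hypothetical equivalent webui invocation:
#   python server.py --disable_exllama --disable_exllamav2 --gpu-memory 20 --cpu-memory 64
from transformers import AutoModelForCausalLM, GPTQConfig

gptq_config = GPTQConfig(
    bits=4,
    disable_exllama=True,      # skip the ExLlama kernel
    disable_exllamav2=True,    # skip the ExLlamav2 kernel (version-dependent argument)
)

model = AutoModelForCausalLM.from_pretrained(
    'TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ',    # hypothetical example checkpoint
    quantization_config=gptq_config,
    device_map='auto',
    max_memory={0: '20GiB', 'cpu': '64GiB'},       # spill the rest to system RAM
)
```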
```diff
@@ -133,6 +133,7 @@ parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton m
 parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
 parser.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
 parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
+parser.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
 
 # GPTQ-for-LLaMa
 parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
```
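Like the other kernel switches, the new flag is a plain `store_true` toggle: omitted means False, passed means True. A standalone sketch of how it parses, separate from the project's own parser:

```python
import argparse

parser = argparse.ArgumentParser(description='Sketch of the ExLlama kernel opt-out flags.')
parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel.')
parser.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')

# Simulating: python server.py --disable_exllamav2
args = parser.parse_args(['--disable_exllamav2'])
print(args.disable_exllama, args.disable_exllamav2)  # False True
```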
```diff
@@ -70,6 +70,7 @@ def list_model_elements():
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'disable_exllama',
+        'disable_exllamav2',
         'cfg_cache',
         'no_flash_attn',
         'cache_8bit',
```
```diff
@@ -125,6 +125,7 @@ def create_ui():
                         shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
                         shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                         shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
+                        shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel.')
                         shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
                         shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
                         shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
```
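In the UI, the flag surfaces as one more checkbox whose initial value mirrors the parsed command-line arguments. A minimal standalone Gradio sketch of that pattern, not the project's actual layout:

```python
import gradio as gr
from types import SimpleNamespace

# Stand-in for shared.args after command-line parsing.
args = SimpleNamespace(disable_exllama=False, disable_exllamav2=False)

with gr.Blocks() as demo:
    disable_exllama = gr.Checkbox(label='disable_exllama', value=args.disable_exllama,
                                  info='Disable ExLlama kernel.')
    disable_exllamav2 = gr.Checkbox(label='disable_exllamav2', value=args.disable_exllamav2,
                                    info='Disable ExLlamav2 kernel.')

# demo.launch()  # uncomment to serve the sketch locally
```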