Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2024-11-22 08:07:56 +01:00
Streamline GPTQ-for-LLaMa support
Parent: a3295dd666
Commit: bee73cedbd
@@ -280,9 +280,6 @@ Optionally, you can use the following command-line flags:
 | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. |
 | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. |
 | `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. |
-| `--quant_attn` | (triton) Enable quant attention. |
-| `--warmup_autotune` | (triton) Enable warmup autotune. |
-| `--fused_mlp` | (triton) Enable fused mlp. |

 #### DeepSpeed

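For context, `--pre_layer` caps how many transformer layers stay on the GPU and hands the rest to the CPU; the real offloading logic lives in GPTQ-for-LLaMa's `llama_inference_offload`. The toy split below is only an illustration of what the setting means for a single GPU, not the webui's implementation:

    def split_layers(num_layers: int, pre_layer: int):
        """Toy illustration: which layer indices stay on the GPU vs. fall back to the CPU."""
        cutoff = min(pre_layer, num_layers)
        gpu_layers = list(range(cutoff))
        cpu_layers = list(range(cutoff, num_layers))
        return gpu_layers, cpu_layers

    # A 40-layer model launched with --pre_layer 30: layers 0-29 on the GPU, 30-39 on the CPU.
    gpu, cpu = split_layers(40, 30)
    print(len(gpu), len(cpu))  # 30 10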
@@ -11,26 +11,9 @@ from transformers import AutoConfig, AutoModelForCausalLM
 import modules.shared as shared
 from modules.logging_colors import logger

-sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
-
-try:
-    import llama_inference_offload
-except ImportError:
-    logger.error('Failed to load GPTQ-for-LLaMa')
-    logger.error('See https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md')
-    sys.exit(-1)
-
-try:
-    from modelutils import find_layers
-except ImportError:
-    from utils import find_layers
-
-try:
-    from quant import make_quant
-    is_triton = False
-except ImportError:
-    import quant
-    is_triton = True
+from gptq_for_llama import llama_inference_offload
+from gptq_for_llama.modelutils import find_layers
+from gptq_for_llama.quant import make_quant


 # This function is a replacement for the load_quant function in the
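The effect of this hunk is that GPTQ-for-LLaMa is now consumed as an installed package (`gptq_for_llama`, pinned later in requirements.txt) rather than a repository checkout injected via `sys.path`. The commit itself drops the old try/except ladder and lets a missing package raise `ImportError` directly; the sketch below is a hypothetical wrapper (not part of the webui) showing how the packaged imports could still be guarded with a friendlier message:

    import sys

    def import_gptq_for_llama():
        """Import the packaged GPTQ-for-LLaMa modules, or exit with an install hint."""
        try:
            from gptq_for_llama import llama_inference_offload
            from gptq_for_llama.modelutils import find_layers
            from gptq_for_llama.quant import make_quant
        except ImportError:
            print('Failed to import gptq_for_llama; install the wheel listed in requirements.txt.',
                  file=sys.stderr)
            sys.exit(-1)
        return llama_inference_offload, find_layers, make_quant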
@@ -59,24 +42,21 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
         if name in layers:
             del layers[name]

-    if not is_triton:
-        gptq_args = inspect.getfullargspec(make_quant).args
-
-        make_quant_kwargs = {
-            'module': model,
-            'names': layers,
-            'bits': wbits,
-        }
-        if 'groupsize' in gptq_args:
-            make_quant_kwargs['groupsize'] = groupsize
-        if 'faster' in gptq_args:
-            make_quant_kwargs['faster'] = faster_kernel
-        if 'kernel_switch_threshold' in gptq_args:
-            make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
-
-        make_quant(**make_quant_kwargs)
-    else:
-        quant.make_quant_linear(model, layers, wbits, groupsize)
+    gptq_args = inspect.getfullargspec(make_quant).args
+
+    make_quant_kwargs = {
+        'module': model,
+        'names': layers,
+        'bits': wbits,
+    }
+    if 'groupsize' in gptq_args:
+        make_quant_kwargs['groupsize'] = groupsize
+    if 'faster' in gptq_args:
+        make_quant_kwargs['faster'] = faster_kernel
+    if 'kernel_switch_threshold' in gptq_args:
+        make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
+
+    make_quant(**make_quant_kwargs)

     del layers
     if checkpoint.endswith('.safetensors'):
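The surviving logic builds `make_quant_kwargs` by inspecting which parameters the installed `make_quant` actually accepts, so the loader keeps working across GPTQ-for-LLaMa versions with different signatures. A generic, self-contained sketch of that introspection pattern (the `load_layer` function here is hypothetical, standing in for `make_quant`):

    import inspect

    def load_layer(module, names, bits, groupsize=-1):
        """Stand-in for make_quant; older releases may lack some keyword arguments."""
        return module, names, bits, groupsize

    # Only pass keyword arguments that the target function's signature declares.
    supported = inspect.getfullargspec(load_layer).args
    candidate_kwargs = {'groupsize': 128, 'faster': True, 'kernel_switch_threshold': 128}
    kwargs = {k: v for k, v in candidate_kwargs.items() if k in supported}
    load_layer('model', ['lm_head'], 4, **kwargs)  # 'faster' and the threshold are silently skipped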
@@ -85,18 +65,6 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
     else:
         model.load_state_dict(torch.load(checkpoint), strict=False)

-    if is_triton:
-        if shared.args.quant_attn:
-            quant.make_quant_attn(model)
-
-        if eval and shared.args.fused_mlp:
-            quant.make_fused_mlp(model)
-
-        if shared.args.warmup_autotune:
-            quant.autotune_warmup_linear(model, transpose=not eval)
-            if eval and shared.args.fused_mlp:
-                quant.autotune_warmup_fused(model)
-
     model.seqlen = 2048
     return model

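Just above the removed Triton block, the loader keeps its existing split between `.safetensors` checkpoints and regular PyTorch checkpoints loaded with `strict=False`. A self-contained sketch of that extension check, assuming the `safetensors` package is available (the helper name is illustrative, not the webui's):

    import torch
    from safetensors.torch import load_file as safe_load

    def load_quantized_state(model, checkpoint: str):
        """Load weights from either a .safetensors file or a regular torch checkpoint."""
        if checkpoint.endswith('.safetensors'):
            state_dict = safe_load(checkpoint, device='cpu')
        else:
            state_dict = torch.load(checkpoint, map_location='cpu')
        # strict=False tolerates keys that quantization added or removed
        model.load_state_dict(state_dict, strict=False)
        return model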
@@ -138,9 +138,6 @@ parser.add_argument('--groupsize', type=int, default=-1, help='Group size.')
 parser.add_argument('--pre_layer', type=int, nargs="+", help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.')
 parser.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
 parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')
-parser.add_argument('--quant_attn', action='store_true', help='(triton) Enable quant attention.')
-parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.')
-parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')

 # AutoGPTQ
 parser.add_argument('--triton', action='store_true', help='Use triton.')
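These flags are plain argparse options, so their parsed shapes can be checked directly; `--pre_layer` with `nargs="+"` yields a list of ints. A small demo using only the options visible in this hunk:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--groupsize', type=int, default=-1)
    parser.add_argument('--pre_layer', type=int, nargs="+")
    parser.add_argument('--checkpoint', type=str)
    parser.add_argument('--monkey-patch', action='store_true')
    parser.add_argument('--triton', action='store_true')

    args = parser.parse_args(['--pre_layer', '30', '60', '--monkey-patch'])
    print(args.pre_layer)     # [30, 60]
    print(args.monkey_patch)  # True (dashes become underscores)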
@@ -110,7 +110,7 @@ def create_ui():
 shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
 shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
 shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
-shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
+shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
 shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
 shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
 shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".')
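The UI lines above all follow the same pattern: a Gradio component registered in `shared.gradio`, with its default taken from `shared.args`, plus informational `gr.Markdown` notes next to the loader options. A stripped-down, standalone sketch of that pattern (not the webui's actual `create_ui`; the default value here is made up):

    import gradio as gr

    with gr.Blocks() as demo:
        # Checkbox mirroring a command-line default, as the webui does with shared.args
        trust_remote_code = gr.Checkbox(
            label="trust-remote-code",
            value=False,
            info="Inspect the .py files inside the model folder before enabling this.",
        )
        # Informational note shown next to the loader options
        gr.Markdown(
            "GPTQ-for-LLaMa support is kept mainly for compatibility with older GPUs; "
            "AutoGPTQ or ExLlama is preferred when compatible."
        )

    if __name__ == "__main__":
        demo.launch()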
@@ -36,3 +36,7 @@ https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.77/llama_cpp_
 # llama-cpp-python with CUDA support
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+
+# GPTQ-for-LLaMa
+https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
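The `; platform_system == "Windows"` suffixes are PEP 508 environment markers: pip evaluates each marker against the installing machine and skips requirement lines whose marker is false, so each platform pulls only its matching wheel. A quick way to see how such a marker evaluates locally, using the `packaging` library (the marker strings are copied from the lines above):

    from packaging.markers import Marker

    # True on Windows, False elsewhere; pip applies the same test to each requirement line.
    windows_only = Marker('platform_system == "Windows"')
    linux_x86_64 = Marker('platform_system == "Linux" and platform_machine == "x86_64"')

    print(windows_only.evaluate())
    print(linux_x86_64.evaluate())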