From 7157257c3f4691b9e4b56ceddb8c428802ac4c54 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 8 Jan 2025 19:28:56 -0300 Subject: [PATCH] Remove the AutoGPTQ loader (#6641) --- README.md | 2 +- modules/AutoGPTQ_loader.py | 74 -------------------------------------- modules/LoRA.py | 38 ++------------------ modules/loaders.py | 21 ----------- modules/models.py | 29 +-------------- modules/models_settings.py | 41 +++------------------ modules/shared.py | 23 +++++------- modules/ui.py | 8 ----- modules/ui_model_menu.py | 9 ----- one_click.py | 2 +- 10 files changed, 19 insertions(+), 228 deletions(-) delete mode 100644 modules/AutoGPTQ_loader.py diff --git a/README.md b/README.md index 0e16aa30..bec686ad 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually. +- Supports multiple text generation backends in a single UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually. - OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Automatic prompt formatting using Jinja2 templates. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py deleted file mode 100644 index 69e8f299..00000000 --- a/modules/AutoGPTQ_loader.py +++ /dev/null @@ -1,74 +0,0 @@ -from pathlib import Path - -from accelerate.utils import is_xpu_available -from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig - -import modules.shared as shared -from modules.logging_colors import logger -from modules.models import get_max_memory_dict - - -def load_quantized(model_name): - path_to_model = Path(f'{shared.args.model_dir}/{model_name}') - pt_path = None - - # Find the model checkpoint - if shared.args.checkpoint: - pt_path = Path(shared.args.checkpoint) - else: - for ext in ['.safetensors', '.pt', '.bin']: - found = list(path_to_model.glob(f"*{ext}")) - if len(found) > 0: - if len(found) > 1: - logger.warning(f'More than one {ext} model has been found. The last one will be selected. 
It could be wrong.') - - pt_path = found[-1] - break - - if pt_path is None: - logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.") - return - - use_safetensors = pt_path.suffix == '.safetensors' - if not (path_to_model / "quantize_config.json").exists(): - quantize_config = BaseQuantizeConfig( - bits=bits if (bits := shared.args.wbits) > 0 else 4, - group_size=gs if (gs := shared.args.groupsize) > 0 else -1, - desc_act=shared.args.desc_act - ) - else: - quantize_config = None - - # Define the params for AutoGPTQForCausalLM.from_quantized - params = { - 'model_basename': pt_path.stem, - 'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu", - 'use_triton': shared.args.triton, - 'inject_fused_attention': False, - 'inject_fused_mlp': not shared.args.no_inject_fused_mlp, - 'use_safetensors': use_safetensors, - 'trust_remote_code': shared.args.trust_remote_code, - 'max_memory': get_max_memory_dict(), - 'quantize_config': quantize_config, - 'use_cuda_fp16': not shared.args.no_use_cuda_fp16, - 'disable_exllama': shared.args.disable_exllama, - 'disable_exllamav2': shared.args.disable_exllamav2, - } - - logger.info(f"The AutoGPTQ params are: {params}") - model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params) - - # These lines fix the multimodal extension when used with AutoGPTQ - if hasattr(model, 'model'): - if not hasattr(model, 'dtype'): - if hasattr(model.model, 'dtype'): - model.dtype = model.model.dtype - - if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'): - if not hasattr(model, 'embed_tokens'): - model.embed_tokens = model.model.model.embed_tokens - - if not hasattr(model.model, 'embed_tokens'): - model.model.embed_tokens = model.model.model.embed_tokens - - return model diff --git a/modules/LoRA.py b/modules/LoRA.py index e1ad01d7..1f4883e2 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -2,13 +2,11 @@ from pathlib import Path import modules.shared as shared from modules.logging_colors import logger -from modules.models import get_device, reload_model +from modules.models import get_device def add_lora_to_model(lora_names): - if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ': - add_lora_autogptq(lora_names) - elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']: + if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']: add_lora_exllamav2(lora_names) else: add_lora_transformers(lora_names) @@ -48,38 +46,6 @@ def add_lora_exllamav2(lora_names): shared.model.loras = None -def add_lora_autogptq(lora_names): - ''' - Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing - ''' - - try: - from auto_gptq import get_gptq_peft_model - from auto_gptq.utils.peft_utils import GPTQLoraConfig - except: - logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.") - return - - if len(lora_names) == 0: - reload_model() - - shared.lora_names = [] - return - else: - if len(lora_names) > 1: - logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. 
Only the first one in the list will be loaded.') - - peft_config = GPTQLoraConfig( - inference_mode=True, - ) - - lora_path = get_lora_path(lora_names[0]) - logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]]))) - shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path) - shared.lora_names = [lora_names[0]] - return - - def add_lora_transformers(lora_names): from peft import PeftModel diff --git a/modules/loaders.py b/modules/loaders.py index 191126b3..4e331dbb 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -25,8 +25,6 @@ loaders_and_params = OrderedDict({ 'use_eager_attention', 'alpha_value', 'compress_pos_emb', - 'disable_exllama', - 'disable_exllamav2', ], 'llama.cpp': [ 'n_ctx', @@ -107,24 +105,6 @@ loaders_and_params = OrderedDict({ 'compress_pos_emb', 'exllamav2_info', ], - 'AutoGPTQ': [ - 'triton', - 'no_inject_fused_mlp', - 'no_use_cuda_fp16', - 'wbits', - 'groupsize', - 'desc_act', - 'disable_exllama', - 'disable_exllamav2', - 'gpu_memory', - 'cpu_memory', - 'cpu', - 'disk', - 'auto_devices', - 'trust_remote_code', - 'no_use_fast', - 'autogptq_info', - ], 'HQQ': [ 'hqq_backend', 'trust_remote_code', @@ -191,7 +171,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), - 'AutoGPTQ': transformers_samplers(), 'HQQ': transformers_samplers(), 'ExLlamav2': { 'temperature', diff --git a/modules/models.py b/modules/models.py index cb1ba218..9c58b279 100644 --- a/modules/models.py +++ b/modules/models.py @@ -3,7 +3,6 @@ import os import pprint import re import time -import traceback from pathlib import Path import torch @@ -21,7 +20,6 @@ from transformers import ( AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, - GPTQConfig, is_torch_npu_available, is_torch_xpu_available ) @@ -73,7 +71,6 @@ def load_model(model_name, loader=None): 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, - 'AutoGPTQ': AutoGPTQ_loader, 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -164,7 +161,7 @@ def huggingface_loader(model_name): LoaderClass = AutoModelForCausalLM # Load the model without any special settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]): logger.info("TRANSFORMERS_PARAMS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) print() @@ -229,21 +226,6 @@ def huggingface_loader(model_name): if shared.args.disk: params['offload_folder'] = shared.args.disk_cache_dir - if shared.args.disable_exllama or shared.args.disable_exllamav2: - try: - gptq_config = GPTQConfig( - bits=config.quantization_config.get('bits', 4), - disable_exllama=shared.args.disable_exllama, - disable_exllamav2=shared.args.disable_exllamav2, - ) - - params['quantization_config'] = gptq_config - logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.') - except: - exc = 
traceback.format_exc() - logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?') - print(exc) - if shared.args.compress_pos_emb > 1: params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} elif shared.args.alpha_value > 1: @@ -310,15 +292,6 @@ def ExLlamav2_HF_loader(model_name): return Exllamav2HF.from_pretrained(model_name) -def AutoGPTQ_loader(model_name): - try: - import modules.AutoGPTQ_loader - except ModuleNotFoundError: - raise ModuleNotFoundError("Failed to import 'autogptq'. Please install it manually following the instructions in the AutoGPTQ GitHub repository.") - - return modules.AutoGPTQ_loader.load_quantized(model_name) - - def HQQ_loader(model_name): try: from hqq.core.quantize import HQQBackend, HQQLinear diff --git a/modules/models_settings.py b/modules/models_settings.py index 1bb00ceb..8d658523 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -11,9 +11,6 @@ def get_fallback_settings(): return { 'bf16': False, 'use_eager_attention': False, - 'wbits': 'None', - 'groupsize': 'None', - 'desc_act': False, 'max_seq_len': 2048, 'n_ctx': 2048, 'rope_freq_base': 0, @@ -111,26 +108,6 @@ def get_model_metadata(model): if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']: model_settings['use_eager_attention'] = True - # Read GPTQ metadata for old GPTQ loaders - if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2': - if 'bits' in metadata['quantization_config']: - model_settings['wbits'] = metadata['quantization_config']['bits'] - if 'group_size' in metadata['quantization_config']: - model_settings['groupsize'] = metadata['quantization_config']['group_size'] - if 'desc_act' in metadata['quantization_config']: - model_settings['desc_act'] = metadata['quantization_config']['desc_act'] - - # Read AutoGPTQ metadata - path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json') - if path.exists(): - metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - if 'bits' in metadata: - model_settings['wbits'] = metadata['bits'] - if 'group_size' in metadata: - model_settings['groupsize'] = metadata['group_size'] - if 'desc_act' in metadata: - model_settings['desc_act'] = metadata['desc_act'] - # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' if path.exists(): @@ -178,7 +155,7 @@ def infer_loader(model_name, model_settings): path_to_model = Path(f'{shared.args.model_dir}/{model_name}') if not path_to_model.exists(): loader = None - elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0): + elif (path_to_model / 'quantize_config.json').exists(): # Old GPTQ metadata file loader = 'ExLlamav2_HF' elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists(): loader = 'llamacpp_HF' @@ -215,16 +192,11 @@ def update_model_parameters(state, initial=False): if initial and element in shared.provided_arguments: continue - # Setting null defaults - if element in ['wbits', 'groupsize'] and value == 'None': - value = vars(shared.args_defaults)[element] - elif element in ['cpu_memory'] and value == 0: + if element in ['cpu_memory'] and value == 0: value = vars(shared.args_defaults)[element] # Making some 
simple conversions - if element in ['wbits', 'groupsize']: - value = int(value) - elif element == 'cpu_memory' and value is not None: + if element == 'cpu_memory' and value is not None: value = f"{value}MiB" setattr(shared.args, element, value) @@ -251,15 +223,12 @@ def apply_model_settings_to_state(model, state): loader = model_settings.pop('loader') # If the user is using an alternative loader for the same model type, let them keep using it - if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']): + if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: if k in state: - if k in ['wbits', 'groupsize']: - state[k] = str(model_settings[k]) - else: - state[k] = model_settings[k] + state[k] = model_settings[k] return state diff --git a/modules/shared.py b/modules/shared.py index 891c0556..6a83baae 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -86,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -147,17 +147,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') -# AutoGPTQ -group = parser.add_argument_group('AutoGPTQ') -group.add_argument('--triton', action='store_true', help='Use triton.') -group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.') -group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') -group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') -group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') -group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.') -group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') -group.add_argument('--groupsize', type=int, default=-1, help='Group size.') - # HQQ group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. 
Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') @@ -220,6 +209,14 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='DEP group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED') group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED') group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED') +group.add_argument('--triton', action='store_true', help='DEPRECATED') +group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED') +group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED') +group.add_argument('--desc_act', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED') +group.add_argument('--wbits', type=int, default=0, help='DEPRECATED') +group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -262,8 +259,6 @@ def fix_loader_name(name): return 'llamacpp_HF' elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']: return 'Transformers' - elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']: - return 'AutoGPTQ' elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: return 'ExLlama' elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: diff --git a/modules/ui.py b/modules/ui.py index 30d4163c..e66de434 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -119,14 +119,6 @@ def list_model_elements(): 'compute_dtype', 'quant_type', 'use_double_quant', - 'wbits', - 'groupsize', - 'triton', - 'desc_act', - 'no_inject_fused_mlp', - 'no_use_cuda_fp16', - 'disable_exllama', - 'disable_exllamav2', 'cfg_cache', 'no_flash_attn', 'no_xformers', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index f2814401..eac0cba6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -89,8 +89,6 @@ def create_ui(): shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") - shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. 
⚠️ Lower this value if you can\'t load the model.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') @@ -121,10 +119,6 @@ def create_ui(): shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') - shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') - shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') - shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) @@ -136,13 +130,10 @@ def create_ui(): shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.') - shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') - shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. 
To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") - shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') with gr.Column(): diff --git a/one_click.py b/one_click.py index 8fc1edf0..ca11efac 100644 --- a/one_click.py +++ b/one_click.py @@ -394,7 +394,7 @@ def update_requirements(initial_installation=False, pull=True): textgen_requirements = [ req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements - if "auto-gptq" not in req.lower() and "autoawq" not in req.lower() + if "autoawq" not in req.lower() ] if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11
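
Not part of the patch: a minimal sketch of how a GPTQ checkpoint can still be loaded once `modules/AutoGPTQ_loader.py` is gone. Per the updated README, the Transformers loader handles such models when a GPTQ backend (e.g. AutoGPTQ) is installed manually, while local folders containing a `quantize_config.json` are otherwise routed to ExLlamav2_HF by the revised `infer_loader`. The model directory below is hypothetical.

```python
# Sketch only (assumes a manually installed GPTQ backend such as auto-gptq):
# the plain Transformers stack reads quantization_config from config.json,
# so the removed loader-specific wbits/groupsize/desc_act flags are no longer needed.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "models/Llama-3-8B-GPTQ"  # hypothetical local checkpoint folder

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto")

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```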