From 7157257c3f4691b9e4b56ceddb8c428802ac4c54 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 8 Jan 2025 19:28:56 -0300 Subject: [PATCH] Remove the AutoGPTQ loader (#6641) --- README.md | 2 +- modules/AutoGPTQ_loader.py | 74 -------------------------------------- modules/LoRA.py | 38 ++------------------ modules/loaders.py | 21 ----------- modules/models.py | 29 +-------------- modules/models_settings.py | 41 +++------------------ modules/shared.py | 23 +++++------- modules/ui.py | 8 ----- modules/ui_model_menu.py | 9 ----- one_click.py | 2 +- 10 files changed, 19 insertions(+), 228 deletions(-) delete mode 100644 modules/AutoGPTQ_loader.py diff --git a/README.md b/README.md index 0e16aa30..bec686ad 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually. +- Supports multiple text generation backends in a single UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually. - OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Automatic prompt formatting using Jinja2 templates. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py deleted file mode 100644 index 69e8f299..00000000 --- a/modules/AutoGPTQ_loader.py +++ /dev/null @@ -1,74 +0,0 @@ -from pathlib import Path - -from accelerate.utils import is_xpu_available -from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig - -import modules.shared as shared -from modules.logging_colors import logger -from modules.models import get_max_memory_dict - - -def load_quantized(model_name): - path_to_model = Path(f'{shared.args.model_dir}/{model_name}') - pt_path = None - - # Find the model checkpoint - if shared.args.checkpoint: - pt_path = Path(shared.args.checkpoint) - else: - for ext in ['.safetensors', '.pt', '.bin']: - found = list(path_to_model.glob(f"*{ext}")) - if len(found) > 0: - if len(found) > 1: - logger.warning(f'More than one {ext} model has been found. The last one will be selected. 
It could be wrong.') - - pt_path = found[-1] - break - - if pt_path is None: - logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.") - return - - use_safetensors = pt_path.suffix == '.safetensors' - if not (path_to_model / "quantize_config.json").exists(): - quantize_config = BaseQuantizeConfig( - bits=bits if (bits := shared.args.wbits) > 0 else 4, - group_size=gs if (gs := shared.args.groupsize) > 0 else -1, - desc_act=shared.args.desc_act - ) - else: - quantize_config = None - - # Define the params for AutoGPTQForCausalLM.from_quantized - params = { - 'model_basename': pt_path.stem, - 'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu", - 'use_triton': shared.args.triton, - 'inject_fused_attention': False, - 'inject_fused_mlp': not shared.args.no_inject_fused_mlp, - 'use_safetensors': use_safetensors, - 'trust_remote_code': shared.args.trust_remote_code, - 'max_memory': get_max_memory_dict(), - 'quantize_config': quantize_config, - 'use_cuda_fp16': not shared.args.no_use_cuda_fp16, - 'disable_exllama': shared.args.disable_exllama, - 'disable_exllamav2': shared.args.disable_exllamav2, - } - - logger.info(f"The AutoGPTQ params are: {params}") - model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params) - - # These lines fix the multimodal extension when used with AutoGPTQ - if hasattr(model, 'model'): - if not hasattr(model, 'dtype'): - if hasattr(model.model, 'dtype'): - model.dtype = model.model.dtype - - if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'): - if not hasattr(model, 'embed_tokens'): - model.embed_tokens = model.model.model.embed_tokens - - if not hasattr(model.model, 'embed_tokens'): - model.model.embed_tokens = model.model.model.embed_tokens - - return model diff --git a/modules/LoRA.py b/modules/LoRA.py index e1ad01d7..1f4883e2 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -2,13 +2,11 @@ from pathlib import Path import modules.shared as shared from modules.logging_colors import logger -from modules.models import get_device, reload_model +from modules.models import get_device def add_lora_to_model(lora_names): - if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ': - add_lora_autogptq(lora_names) - elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']: + if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']: add_lora_exllamav2(lora_names) else: add_lora_transformers(lora_names) @@ -48,38 +46,6 @@ def add_lora_exllamav2(lora_names): shared.model.loras = None -def add_lora_autogptq(lora_names): - ''' - Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing - ''' - - try: - from auto_gptq import get_gptq_peft_model - from auto_gptq.utils.peft_utils import GPTQLoraConfig - except: - logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.") - return - - if len(lora_names) == 0: - reload_model() - - shared.lora_names = [] - return - else: - if len(lora_names) > 1: - logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. 
Only the first one in the list will be loaded.') - - peft_config = GPTQLoraConfig( - inference_mode=True, - ) - - lora_path = get_lora_path(lora_names[0]) - logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]]))) - shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path) - shared.lora_names = [lora_names[0]] - return - - def add_lora_transformers(lora_names): from peft import PeftModel diff --git a/modules/loaders.py b/modules/loaders.py index 191126b3..4e331dbb 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -25,8 +25,6 @@ loaders_and_params = OrderedDict({ 'use_eager_attention', 'alpha_value', 'compress_pos_emb', - 'disable_exllama', - 'disable_exllamav2', ], 'llama.cpp': [ 'n_ctx', @@ -107,24 +105,6 @@ loaders_and_params = OrderedDict({ 'compress_pos_emb', 'exllamav2_info', ], - 'AutoGPTQ': [ - 'triton', - 'no_inject_fused_mlp', - 'no_use_cuda_fp16', - 'wbits', - 'groupsize', - 'desc_act', - 'disable_exllama', - 'disable_exllamav2', - 'gpu_memory', - 'cpu_memory', - 'cpu', - 'disk', - 'auto_devices', - 'trust_remote_code', - 'no_use_fast', - 'autogptq_info', - ], 'HQQ': [ 'hqq_backend', 'trust_remote_code', @@ -191,7 +171,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), - 'AutoGPTQ': transformers_samplers(), 'HQQ': transformers_samplers(), 'ExLlamav2': { 'temperature', diff --git a/modules/models.py b/modules/models.py index cb1ba218..9c58b279 100644 --- a/modules/models.py +++ b/modules/models.py @@ -3,7 +3,6 @@ import os import pprint import re import time -import traceback from pathlib import Path import torch @@ -21,7 +20,6 @@ from transformers import ( AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, - GPTQConfig, is_torch_npu_available, is_torch_xpu_available ) @@ -73,7 +71,6 @@ def load_model(model_name, loader=None): 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, - 'AutoGPTQ': AutoGPTQ_loader, 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -164,7 +161,7 @@ def huggingface_loader(model_name): LoaderClass = AutoModelForCausalLM # Load the model without any special settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]): logger.info("TRANSFORMERS_PARAMS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) print() @@ -229,21 +226,6 @@ def huggingface_loader(model_name): if shared.args.disk: params['offload_folder'] = shared.args.disk_cache_dir - if shared.args.disable_exllama or shared.args.disable_exllamav2: - try: - gptq_config = GPTQConfig( - bits=config.quantization_config.get('bits', 4), - disable_exllama=shared.args.disable_exllama, - disable_exllamav2=shared.args.disable_exllamav2, - ) - - params['quantization_config'] = gptq_config - logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.') - except: - exc = 
traceback.format_exc() - logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?') - print(exc) - if shared.args.compress_pos_emb > 1: params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} elif shared.args.alpha_value > 1: @@ -310,15 +292,6 @@ def ExLlamav2_HF_loader(model_name): return Exllamav2HF.from_pretrained(model_name) -def AutoGPTQ_loader(model_name): - try: - import modules.AutoGPTQ_loader - except ModuleNotFoundError: - raise ModuleNotFoundError("Failed to import 'autogptq'. Please install it manually following the instructions in the AutoGPTQ GitHub repository.") - - return modules.AutoGPTQ_loader.load_quantized(model_name) - - def HQQ_loader(model_name): try: from hqq.core.quantize import HQQBackend, HQQLinear diff --git a/modules/models_settings.py b/modules/models_settings.py index 1bb00ceb..8d658523 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -11,9 +11,6 @@ def get_fallback_settings(): return { 'bf16': False, 'use_eager_attention': False, - 'wbits': 'None', - 'groupsize': 'None', - 'desc_act': False, 'max_seq_len': 2048, 'n_ctx': 2048, 'rope_freq_base': 0, @@ -111,26 +108,6 @@ def get_model_metadata(model): if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']: model_settings['use_eager_attention'] = True - # Read GPTQ metadata for old GPTQ loaders - if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2': - if 'bits' in metadata['quantization_config']: - model_settings['wbits'] = metadata['quantization_config']['bits'] - if 'group_size' in metadata['quantization_config']: - model_settings['groupsize'] = metadata['quantization_config']['group_size'] - if 'desc_act' in metadata['quantization_config']: - model_settings['desc_act'] = metadata['quantization_config']['desc_act'] - - # Read AutoGPTQ metadata - path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json') - if path.exists(): - metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - if 'bits' in metadata: - model_settings['wbits'] = metadata['bits'] - if 'group_size' in metadata: - model_settings['groupsize'] = metadata['group_size'] - if 'desc_act' in metadata: - model_settings['desc_act'] = metadata['desc_act'] - # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' if path.exists(): @@ -178,7 +155,7 @@ def infer_loader(model_name, model_settings): path_to_model = Path(f'{shared.args.model_dir}/{model_name}') if not path_to_model.exists(): loader = None - elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0): + elif (path_to_model / 'quantize_config.json').exists(): # Old GPTQ metadata file loader = 'ExLlamav2_HF' elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists(): loader = 'llamacpp_HF' @@ -215,16 +192,11 @@ def update_model_parameters(state, initial=False): if initial and element in shared.provided_arguments: continue - # Setting null defaults - if element in ['wbits', 'groupsize'] and value == 'None': - value = vars(shared.args_defaults)[element] - elif element in ['cpu_memory'] and value == 0: + if element in ['cpu_memory'] and value == 0: value = vars(shared.args_defaults)[element] # Making some 
simple conversions - if element in ['wbits', 'groupsize']: - value = int(value) - elif element == 'cpu_memory' and value is not None: + if element == 'cpu_memory' and value is not None: value = f"{value}MiB" setattr(shared.args, element, value) @@ -251,15 +223,12 @@ def apply_model_settings_to_state(model, state): loader = model_settings.pop('loader') # If the user is using an alternative loader for the same model type, let them keep using it - if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']): + if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: if k in state: - if k in ['wbits', 'groupsize']: - state[k] = str(model_settings[k]) - else: - state[k] = model_settings[k] + state[k] = model_settings[k] return state diff --git a/modules/shared.py b/modules/shared.py index 891c0556..6a83baae 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -86,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -147,17 +147,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') -# AutoGPTQ -group = parser.add_argument_group('AutoGPTQ') -group.add_argument('--triton', action='store_true', help='Use triton.') -group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.') -group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') -group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') -group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') -group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.') -group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') -group.add_argument('--groupsize', type=int, default=-1, help='Group size.') - # HQQ group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. 
Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') @@ -220,6 +209,14 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='DEP group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED') group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED') group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED') +group.add_argument('--triton', action='store_true', help='DEPRECATED') +group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED') +group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED') +group.add_argument('--desc_act', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED') +group.add_argument('--wbits', type=int, default=0, help='DEPRECATED') +group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -262,8 +259,6 @@ def fix_loader_name(name): return 'llamacpp_HF' elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']: return 'Transformers' - elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']: - return 'AutoGPTQ' elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: return 'ExLlama' elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: diff --git a/modules/ui.py b/modules/ui.py index 30d4163c..e66de434 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -119,14 +119,6 @@ def list_model_elements(): 'compute_dtype', 'quant_type', 'use_double_quant', - 'wbits', - 'groupsize', - 'triton', - 'desc_act', - 'no_inject_fused_mlp', - 'no_use_cuda_fp16', - 'disable_exllama', - 'disable_exllamav2', 'cfg_cache', 'no_flash_attn', 'no_xformers', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index f2814401..eac0cba6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -89,8 +89,6 @@ def create_ui(): shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") - shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. 
⚠️ Lower this value if you can\'t load the model.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') @@ -121,10 +119,6 @@ def create_ui(): shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') - shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') - shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') - shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) @@ -136,13 +130,10 @@ def create_ui(): shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.') - shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') - shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. 
To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") - shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') with gr.Column(): diff --git a/one_click.py b/one_click.py index 8fc1edf0..ca11efac 100644 --- a/one_click.py +++ b/one_click.py @@ -394,7 +394,7 @@ def update_requirements(initial_installation=False, pull=True): textgen_requirements = [ req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements - if "auto-gptq" not in req.lower() and "autoawq" not in req.lower() + if "autoawq" not in req.lower() ] if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11
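
Not part of the patch: a minimal sketch of how a GPTQ checkpoint can still be loaded once `modules/AutoGPTQ_loader.py` is gone. Per the updated README, the Transformers loader handles such models when a GPTQ backend (e.g. AutoGPTQ) is installed manually, while local folders containing a `quantize_config.json` are otherwise routed to ExLlamav2_HF by the revised `infer_loader`. The model directory below is hypothetical.

```python
# Sketch only (assumes a manually installed GPTQ backend such as auto-gptq):
# the plain Transformers stack reads quantization_config from config.json,
# so the removed loader-specific wbits/groupsize/desc_act flags are no longer needed.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "models/Llama-3-8B-GPTQ"  # hypothetical local checkpoint folder

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto")

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```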