Remove the AutoGPTQ loader (#6641)

This commit is contained in:
oobabooga 2025-01-08 19:28:56 -03:00 committed by GitHub
parent d3adcbf64b
commit 7157257c3f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 19 additions and 228 deletions

View File

@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features
- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually.
- Supports multiple text generation backends in a single UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually.
- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- Automatic prompt formatting using Jinja2 templates.
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.

View File

@ -1,74 +0,0 @@
from pathlib import Path
from accelerate.utils import is_xpu_available
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import modules.shared as shared
from modules.logging_colors import logger
from modules.models import get_max_memory_dict
def load_quantized(model_name):
    """
    Load a quantized model from the models directory using AutoGPTQ.

    The checkpoint file is taken from ``shared.args.checkpoint`` when that
    is set; otherwise the model folder is searched for ``.safetensors``,
    ``.pt`` and ``.bin`` files, in that order, and the last match of the
    first extension that yields any file is used.

    Returns the object produced by ``AutoGPTQForCausalLM.from_quantized``,
    or ``None`` (after logging an error) when no checkpoint file is found.
    """
    model_dir = Path(f'{shared.args.model_dir}/{model_name}')

    # Find the model checkpoint
    checkpoint = None
    if shared.args.checkpoint:
        checkpoint = Path(shared.args.checkpoint)
    else:
        for extension in ['.safetensors', '.pt', '.bin']:
            candidates = list(model_dir.glob(f"*{extension}"))
            if not candidates:
                continue

            if len(candidates) > 1:
                logger.warning(f'More than one {extension} model has been found. The last one will be selected. It could be wrong.')

            checkpoint = candidates[-1]
            break

    if checkpoint is None:
        logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")
        return

    # Without a quantize_config.json, build the config from the command-line
    # values, substituting 4 bits / group size -1 when they are not positive.
    quantize_config = None
    if not (model_dir / "quantize_config.json").exists():
        wbits = shared.args.wbits
        groupsize = shared.args.groupsize
        quantize_config = BaseQuantizeConfig(
            bits=wbits if wbits > 0 else 4,
            group_size=groupsize if groupsize > 0 else -1,
            desc_act=shared.args.desc_act
        )

    # XPU is chosen whenever it is available; otherwise CUDA unless --cpu is set
    if is_xpu_available():
        device = "xpu:0"
    elif not shared.args.cpu:
        device = "cuda:0"
    else:
        device = "cpu"

    # Define the params for AutoGPTQForCausalLM.from_quantized
    load_kwargs = {
        'model_basename': checkpoint.stem,
        'device': device,
        'use_triton': shared.args.triton,
        'inject_fused_attention': False,
        'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
        'use_safetensors': checkpoint.suffix == '.safetensors',
        'trust_remote_code': shared.args.trust_remote_code,
        'max_memory': get_max_memory_dict(),
        'quantize_config': quantize_config,
        'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
        'disable_exllama': shared.args.disable_exllama,
        'disable_exllamav2': shared.args.disable_exllamav2,
    }

    logger.info(f"The AutoGPTQ params are: {load_kwargs}")
    model = AutoGPTQForCausalLM.from_quantized(model_dir, **load_kwargs)

    # These lines fix the multimodal extension when used with AutoGPTQ:
    # copy `dtype` and `embed_tokens` up from the wrapped model(s) onto the
    # outer objects when they are not already exposed there.
    if hasattr(model, 'model'):
        if not hasattr(model, 'dtype') and hasattr(model.model, 'dtype'):
            model.dtype = model.model.dtype

        if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'):
            if not hasattr(model, 'embed_tokens'):
                model.embed_tokens = model.model.model.embed_tokens

            if not hasattr(model.model, 'embed_tokens'):
                model.model.embed_tokens = model.model.model.embed_tokens

    return model

View File

@ -2,13 +2,11 @@ from pathlib import Path
import modules.shared as shared
from modules.logging_colors import logger
from modules.models import get_device, reload_model
from modules.models import get_device
def add_lora_to_model(lora_names):
if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
add_lora_autogptq(lora_names)
elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
add_lora_exllamav2(lora_names)
else:
add_lora_transformers(lora_names)
@ -48,38 +46,6 @@ def add_lora_exllamav2(lora_names):
shared.model.loras = None
def add_lora_autogptq(lora_names):
    '''
    Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing

    Apply a LoRA to the currently loaded AutoGPTQ model (``shared.model``).

    lora_names: list of LoRA names. An empty list calls ``reload_model()``
    and clears ``shared.lora_names``. When more than one name is given,
    only the first is applied and a warning is logged.

    Returns None. If the installed AutoGPTQ lacks the LoRA helpers, an
    error is logged and nothing is changed.
    '''
    try:
        from auto_gptq import get_gptq_peft_model
        from auto_gptq.utils.peft_utils import GPTQLoraConfig
    except Exception:
        # Best-effort optional import: any ordinary failure is reported and
        # swallowed, but KeyboardInterrupt/SystemExit are no longer caught
        # as they were with a bare `except:`.
        logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
        return

    if not lora_names:
        reload_model()
        shared.lora_names = []
        return

    if len(lora_names) > 1:
        logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')

    lora_name = lora_names[0]
    peft_config = GPTQLoraConfig(
        inference_mode=True,
    )

    lora_path = get_lora_path(lora_name)
    logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, lora_name))
    shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
    shared.lora_names = [lora_name]
def add_lora_transformers(lora_names):
from peft import PeftModel

View File

@ -25,8 +25,6 @@ loaders_and_params = OrderedDict({
'use_eager_attention',
'alpha_value',
'compress_pos_emb',
'disable_exllama',
'disable_exllamav2',
],
'llama.cpp': [
'n_ctx',
@ -107,24 +105,6 @@ loaders_and_params = OrderedDict({
'compress_pos_emb',
'exllamav2_info',
],
'AutoGPTQ': [
'triton',
'no_inject_fused_mlp',
'no_use_cuda_fp16',
'wbits',
'groupsize',
'desc_act',
'disable_exllama',
'disable_exllamav2',
'gpu_memory',
'cpu_memory',
'cpu',
'disk',
'auto_devices',
'trust_remote_code',
'no_use_fast',
'autogptq_info',
],
'HQQ': [
'hqq_backend',
'trust_remote_code',
@ -191,7 +171,6 @@ def transformers_samplers():
loaders_samplers = {
'Transformers': transformers_samplers(),
'AutoGPTQ': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav2': {
'temperature',

View File

@ -3,7 +3,6 @@ import os
import pprint
import re
import time
import traceback
from pathlib import Path
import torch
@ -21,7 +20,6 @@ from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
GPTQConfig,
is_torch_npu_available,
is_torch_xpu_available
)
@ -73,7 +71,6 @@ def load_model(model_name, loader=None):
'llamacpp_HF': llamacpp_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'AutoGPTQ': AutoGPTQ_loader,
'HQQ': HQQ_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
}
@ -164,7 +161,7 @@ def huggingface_loader(model_name):
LoaderClass = AutoModelForCausalLM
# Load the model without any special settings
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]):
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]):
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
@ -229,21 +226,6 @@ def huggingface_loader(model_name):
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.disable_exllama or shared.args.disable_exllamav2:
try:
gptq_config = GPTQConfig(
bits=config.quantization_config.get('bits', 4),
disable_exllama=shared.args.disable_exllama,
disable_exllamav2=shared.args.disable_exllamav2,
)
params['quantization_config'] = gptq_config
logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.')
except:
exc = traceback.format_exc()
logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?')
print(exc)
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
@ -310,15 +292,6 @@ def ExLlamav2_HF_loader(model_name):
return Exllamav2HF.from_pretrained(model_name)
def AutoGPTQ_loader(model_name):
    """
    Load `model_name` through the AutoGPTQ loader module.

    The loader module is imported lazily so that its dependency is only
    required when this loader is actually selected.

    Raises:
        ModuleNotFoundError: when the loader module (or a module it imports)
            cannot be found. The original error is attached as `__cause__`.
    """
    try:
        import modules.AutoGPTQ_loader
    except ModuleNotFoundError as err:
        # Chain explicitly so the traceback shows which module was missing
        raise ModuleNotFoundError("Failed to import 'autogptq'. Please install it manually following the instructions in the AutoGPTQ GitHub repository.") from err

    return modules.AutoGPTQ_loader.load_quantized(model_name)
def HQQ_loader(model_name):
try:
from hqq.core.quantize import HQQBackend, HQQLinear

View File

@ -11,9 +11,6 @@ def get_fallback_settings():
return {
'bf16': False,
'use_eager_attention': False,
'wbits': 'None',
'groupsize': 'None',
'desc_act': False,
'max_seq_len': 2048,
'n_ctx': 2048,
'rope_freq_base': 0,
@ -111,26 +108,6 @@ def get_model_metadata(model):
if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
model_settings['use_eager_attention'] = True
# Read GPTQ metadata for old GPTQ loaders
if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2':
if 'bits' in metadata['quantization_config']:
model_settings['wbits'] = metadata['quantization_config']['bits']
if 'group_size' in metadata['quantization_config']:
model_settings['groupsize'] = metadata['quantization_config']['group_size']
if 'desc_act' in metadata['quantization_config']:
model_settings['desc_act'] = metadata['quantization_config']['desc_act']
# Read AutoGPTQ metadata
path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json')
if path.exists():
metadata = json.loads(open(path, 'r', encoding='utf-8').read())
if 'bits' in metadata:
model_settings['wbits'] = metadata['bits']
if 'group_size' in metadata:
model_settings['groupsize'] = metadata['group_size']
if 'desc_act' in metadata:
model_settings['desc_act'] = metadata['desc_act']
# Try to find the Jinja instruct template
path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
if path.exists():
@ -178,7 +155,7 @@ def infer_loader(model_name, model_settings):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
if not path_to_model.exists():
loader = None
elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
elif (path_to_model / 'quantize_config.json').exists(): # Old GPTQ metadata file
loader = 'ExLlamav2_HF'
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
loader = 'llamacpp_HF'
@ -215,16 +192,11 @@ def update_model_parameters(state, initial=False):
if initial and element in shared.provided_arguments:
continue
# Setting null defaults
if element in ['wbits', 'groupsize'] and value == 'None':
value = vars(shared.args_defaults)[element]
elif element in ['cpu_memory'] and value == 0:
if element in ['cpu_memory'] and value == 0:
value = vars(shared.args_defaults)[element]
# Making some simple conversions
if element in ['wbits', 'groupsize']:
value = int(value)
elif element == 'cpu_memory' and value is not None:
if element == 'cpu_memory' and value is not None:
value = f"{value}MiB"
setattr(shared.args, element, value)
@ -251,15 +223,12 @@ def apply_model_settings_to_state(model, state):
loader = model_settings.pop('loader')
# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']):
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
state['loader'] = loader
for k in model_settings:
if k in state:
if k in ['wbits', 'groupsize']:
state[k] = str(model_settings[k])
else:
state[k] = model_settings[k]
state[k] = model_settings[k]
return state

View File

@ -86,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@ -147,17 +147,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
# AutoGPTQ
group = parser.add_argument_group('AutoGPTQ')
group.add_argument('--triton', action='store_true', help='Use triton.')
group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.')
group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
# HQQ
group = parser.add_argument_group('HQQ')
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
@ -220,6 +209,14 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='DEP
group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED')
group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED')
group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED')
group.add_argument('--triton', action='store_true', help='DEPRECATED')
group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED')
group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED')
group.add_argument('--desc_act', action='store_true', help='DEPRECATED')
group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED')
group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED')
group.add_argument('--wbits', type=int, default=0, help='DEPRECATED')
group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED')
args = parser.parse_args()
args_defaults = parser.parse_args([])
@ -262,8 +259,6 @@ def fix_loader_name(name):
return 'llamacpp_HF'
elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
return 'Transformers'
elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
return 'AutoGPTQ'
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
return 'ExLlama'
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:

View File

@ -119,14 +119,6 @@ def list_model_elements():
'compute_dtype',
'quant_type',
'use_double_quant',
'wbits',
'groupsize',
'triton',
'desc_act',
'no_inject_fused_mlp',
'no_use_cuda_fp16',
'disable_exllama',
'disable_exllamav2',
'cfg_cache',
'no_flash_attn',
'no_xformers',

View File

@ -89,8 +89,6 @@ def create_ui():
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
@ -121,10 +119,6 @@ def create_ui():
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
@ -136,13 +130,10 @@ def create_ui():
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.')
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.')
shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
with gr.Column():

View File

@ -394,7 +394,7 @@ def update_requirements(initial_installation=False, pull=True):
textgen_requirements = [
req.replace('+cu121', '+cu118').replace('+cu122', '+cu118')
for req in textgen_requirements
if "auto-gptq" not in req.lower() and "autoawq" not in req.lower()
if "autoawq" not in req.lower()
]
if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11