mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2025-01-09 20:19:06 +01:00
Remove the AutoGPTQ loader (#6641)
This commit is contained in:
parent
d3adcbf64b
commit
7157257c3f
@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
|
|||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually.
|
- Supports multiple text generation backends in a single UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually.
|
||||||
- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
|
- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
|
||||||
- Automatic prompt formatting using Jinja2 templates.
|
- Automatic prompt formatting using Jinja2 templates.
|
||||||
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
|
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
|
||||||
|
@ -1,74 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from accelerate.utils import is_xpu_available
|
|
||||||
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
|
|
||||||
|
|
||||||
import modules.shared as shared
|
|
||||||
from modules.logging_colors import logger
|
|
||||||
from modules.models import get_max_memory_dict
|
|
||||||
|
|
||||||
|
|
||||||
def load_quantized(model_name):
    """Load a quantized model through ``AutoGPTQForCausalLM.from_quantized``.

    The checkpoint is taken from ``shared.args.checkpoint`` when that is set;
    otherwise the model directory is searched for a ``.safetensors``, ``.pt``
    or ``.bin`` file, in that order of preference.

    Args:
        model_name: Name of the model folder inside ``shared.args.model_dir``.

    Returns:
        The loaded model, or ``None`` (after logging an error) when no
        checkpoint file could be located.
    """
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
    pt_path = None

    # Find the model checkpoint
    if shared.args.checkpoint:
        pt_path = Path(shared.args.checkpoint)
    else:
        for ext in ['.safetensors', '.pt', '.bin']:
            # glob() yields entries in filesystem order; sort so that the
            # "last one" announced in the warning below is deterministic.
            found = sorted(path_to_model.glob(f"*{ext}"))
            if len(found) > 0:
                if len(found) > 1:
                    logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')

                pt_path = found[-1]
                break

    if pt_path is None:
        logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")
        return

    use_safetensors = pt_path.suffix == '.safetensors'

    # Without a quantize_config.json, build the quantization config from the
    # command-line values, falling back to 4 bits / group size -1 when those
    # are not positive.
    if not (path_to_model / "quantize_config.json").exists():
        wbits = shared.args.wbits
        groupsize = shared.args.groupsize
        quantize_config = BaseQuantizeConfig(
            bits=wbits if wbits > 0 else 4,
            group_size=groupsize if groupsize > 0 else -1,
            desc_act=shared.args.desc_act
        )
    else:
        quantize_config = None

    # Device precedence: XPU when available, otherwise CUDA unless --cpu is set.
    if is_xpu_available():
        device = "xpu:0"
    elif not shared.args.cpu:
        device = "cuda:0"
    else:
        device = "cpu"

    # Define the params for AutoGPTQForCausalLM.from_quantized
    params = {
        'model_basename': pt_path.stem,
        'device': device,
        'use_triton': shared.args.triton,
        'inject_fused_attention': False,
        'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
        'use_safetensors': use_safetensors,
        'trust_remote_code': shared.args.trust_remote_code,
        'max_memory': get_max_memory_dict(),
        'quantize_config': quantize_config,
        'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
        'disable_exllama': shared.args.disable_exllama,
        'disable_exllamav2': shared.args.disable_exllamav2,
    }

    logger.info(f"The AutoGPTQ params are: {params}")
    model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)

    # These lines fix the multimodal extension when used with AutoGPTQ:
    # expose `dtype` and `embed_tokens` on the wrapper objects when only the
    # wrapped inner model provides them.
    if hasattr(model, 'model'):
        if not hasattr(model, 'dtype'):
            if hasattr(model.model, 'dtype'):
                model.dtype = model.model.dtype

        if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'):
            if not hasattr(model, 'embed_tokens'):
                model.embed_tokens = model.model.model.embed_tokens

            if not hasattr(model.model, 'embed_tokens'):
                model.model.embed_tokens = model.model.model.embed_tokens

    return model
|
|
@ -2,13 +2,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
import modules.shared as shared
|
import modules.shared as shared
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
from modules.models import get_device, reload_model
|
from modules.models import get_device
|
||||||
|
|
||||||
|
|
||||||
def add_lora_to_model(lora_names):
|
def add_lora_to_model(lora_names):
|
||||||
if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
|
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
|
||||||
add_lora_autogptq(lora_names)
|
|
||||||
elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
|
|
||||||
add_lora_exllamav2(lora_names)
|
add_lora_exllamav2(lora_names)
|
||||||
else:
|
else:
|
||||||
add_lora_transformers(lora_names)
|
add_lora_transformers(lora_names)
|
||||||
@ -48,38 +46,6 @@ def add_lora_exllamav2(lora_names):
|
|||||||
shared.model.loras = None
|
shared.model.loras = None
|
||||||
|
|
||||||
|
|
||||||
def add_lora_autogptq(lora_names):
    '''
    Apply a single LoRA to the currently loaded AutoGPTQ model.

    Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing

    An empty `lora_names` reloads the model and clears `shared.lora_names`.
    Otherwise only the first name in the list is applied; a warning is logged
    when more than one is given. If the installed AutoGPTQ lacks the LoRA
    helpers, an error is logged and nothing is changed.
    '''

    try:
        from auto_gptq import get_gptq_peft_model
        from auto_gptq.utils.peft_utils import GPTQLoraConfig
    except Exception:
        # Best-effort import: any import-time failure is reported the same
        # way, but KeyboardInterrupt/SystemExit are no longer swallowed.
        logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
        return

    if len(lora_names) == 0:
        reload_model()

        shared.lora_names = []
        return

    if len(lora_names) > 1:
        logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')

    first_lora = lora_names[0]
    peft_config = GPTQLoraConfig(
        inference_mode=True,
    )

    lora_path = get_lora_path(first_lora)
    logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, first_lora))
    shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
    shared.lora_names = [first_lora]
    return
|
|
||||||
|
|
||||||
|
|
||||||
def add_lora_transformers(lora_names):
|
def add_lora_transformers(lora_names):
|
||||||
|
|
||||||
from peft import PeftModel
|
from peft import PeftModel
|
||||||
|
@ -25,8 +25,6 @@ loaders_and_params = OrderedDict({
|
|||||||
'use_eager_attention',
|
'use_eager_attention',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'disable_exllama',
|
|
||||||
'disable_exllamav2',
|
|
||||||
],
|
],
|
||||||
'llama.cpp': [
|
'llama.cpp': [
|
||||||
'n_ctx',
|
'n_ctx',
|
||||||
@ -107,24 +105,6 @@ loaders_and_params = OrderedDict({
|
|||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'exllamav2_info',
|
'exllamav2_info',
|
||||||
],
|
],
|
||||||
'AutoGPTQ': [
|
|
||||||
'triton',
|
|
||||||
'no_inject_fused_mlp',
|
|
||||||
'no_use_cuda_fp16',
|
|
||||||
'wbits',
|
|
||||||
'groupsize',
|
|
||||||
'desc_act',
|
|
||||||
'disable_exllama',
|
|
||||||
'disable_exllamav2',
|
|
||||||
'gpu_memory',
|
|
||||||
'cpu_memory',
|
|
||||||
'cpu',
|
|
||||||
'disk',
|
|
||||||
'auto_devices',
|
|
||||||
'trust_remote_code',
|
|
||||||
'no_use_fast',
|
|
||||||
'autogptq_info',
|
|
||||||
],
|
|
||||||
'HQQ': [
|
'HQQ': [
|
||||||
'hqq_backend',
|
'hqq_backend',
|
||||||
'trust_remote_code',
|
'trust_remote_code',
|
||||||
@ -191,7 +171,6 @@ def transformers_samplers():
|
|||||||
|
|
||||||
loaders_samplers = {
|
loaders_samplers = {
|
||||||
'Transformers': transformers_samplers(),
|
'Transformers': transformers_samplers(),
|
||||||
'AutoGPTQ': transformers_samplers(),
|
|
||||||
'HQQ': transformers_samplers(),
|
'HQQ': transformers_samplers(),
|
||||||
'ExLlamav2': {
|
'ExLlamav2': {
|
||||||
'temperature',
|
'temperature',
|
||||||
|
@ -3,7 +3,6 @@ import os
|
|||||||
import pprint
|
import pprint
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import traceback
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@ -21,7 +20,6 @@ from transformers import (
|
|||||||
AutoModelForSeq2SeqLM,
|
AutoModelForSeq2SeqLM,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
BitsAndBytesConfig,
|
BitsAndBytesConfig,
|
||||||
GPTQConfig,
|
|
||||||
is_torch_npu_available,
|
is_torch_npu_available,
|
||||||
is_torch_xpu_available
|
is_torch_xpu_available
|
||||||
)
|
)
|
||||||
@ -73,7 +71,6 @@ def load_model(model_name, loader=None):
|
|||||||
'llamacpp_HF': llamacpp_HF_loader,
|
'llamacpp_HF': llamacpp_HF_loader,
|
||||||
'ExLlamav2': ExLlamav2_loader,
|
'ExLlamav2': ExLlamav2_loader,
|
||||||
'ExLlamav2_HF': ExLlamav2_HF_loader,
|
'ExLlamav2_HF': ExLlamav2_HF_loader,
|
||||||
'AutoGPTQ': AutoGPTQ_loader,
|
|
||||||
'HQQ': HQQ_loader,
|
'HQQ': HQQ_loader,
|
||||||
'TensorRT-LLM': TensorRT_LLM_loader,
|
'TensorRT-LLM': TensorRT_LLM_loader,
|
||||||
}
|
}
|
||||||
@ -164,7 +161,7 @@ def huggingface_loader(model_name):
|
|||||||
LoaderClass = AutoModelForCausalLM
|
LoaderClass = AutoModelForCausalLM
|
||||||
|
|
||||||
# Load the model without any special settings
|
# Load the model without any special settings
|
||||||
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]):
|
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]):
|
||||||
logger.info("TRANSFORMERS_PARAMS=")
|
logger.info("TRANSFORMERS_PARAMS=")
|
||||||
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
|
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
|
||||||
print()
|
print()
|
||||||
@ -229,21 +226,6 @@ def huggingface_loader(model_name):
|
|||||||
if shared.args.disk:
|
if shared.args.disk:
|
||||||
params['offload_folder'] = shared.args.disk_cache_dir
|
params['offload_folder'] = shared.args.disk_cache_dir
|
||||||
|
|
||||||
if shared.args.disable_exllama or shared.args.disable_exllamav2:
|
|
||||||
try:
|
|
||||||
gptq_config = GPTQConfig(
|
|
||||||
bits=config.quantization_config.get('bits', 4),
|
|
||||||
disable_exllama=shared.args.disable_exllama,
|
|
||||||
disable_exllamav2=shared.args.disable_exllamav2,
|
|
||||||
)
|
|
||||||
|
|
||||||
params['quantization_config'] = gptq_config
|
|
||||||
logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.')
|
|
||||||
except:
|
|
||||||
exc = traceback.format_exc()
|
|
||||||
logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?')
|
|
||||||
print(exc)
|
|
||||||
|
|
||||||
if shared.args.compress_pos_emb > 1:
|
if shared.args.compress_pos_emb > 1:
|
||||||
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
|
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
|
||||||
elif shared.args.alpha_value > 1:
|
elif shared.args.alpha_value > 1:
|
||||||
@ -310,15 +292,6 @@ def ExLlamav2_HF_loader(model_name):
|
|||||||
return Exllamav2HF.from_pretrained(model_name)
|
return Exllamav2HF.from_pretrained(model_name)
|
||||||
|
|
||||||
|
|
||||||
def AutoGPTQ_loader(model_name):
    """Load `model_name` through the optional `modules.AutoGPTQ_loader` module.

    The module is imported lazily inside the function rather than at file
    level.

    Raises:
        ModuleNotFoundError: If the import fails; the original import error is
            attached as the cause.
    """
    try:
        import modules.AutoGPTQ_loader
    except ModuleNotFoundError as err:
        raise ModuleNotFoundError("Failed to import 'autogptq'. Please install it manually following the instructions in the AutoGPTQ GitHub repository.") from err

    return modules.AutoGPTQ_loader.load_quantized(model_name)
|
|
||||||
|
|
||||||
|
|
||||||
def HQQ_loader(model_name):
|
def HQQ_loader(model_name):
|
||||||
try:
|
try:
|
||||||
from hqq.core.quantize import HQQBackend, HQQLinear
|
from hqq.core.quantize import HQQBackend, HQQLinear
|
||||||
|
@ -11,9 +11,6 @@ def get_fallback_settings():
|
|||||||
return {
|
return {
|
||||||
'bf16': False,
|
'bf16': False,
|
||||||
'use_eager_attention': False,
|
'use_eager_attention': False,
|
||||||
'wbits': 'None',
|
|
||||||
'groupsize': 'None',
|
|
||||||
'desc_act': False,
|
|
||||||
'max_seq_len': 2048,
|
'max_seq_len': 2048,
|
||||||
'n_ctx': 2048,
|
'n_ctx': 2048,
|
||||||
'rope_freq_base': 0,
|
'rope_freq_base': 0,
|
||||||
@ -111,26 +108,6 @@ def get_model_metadata(model):
|
|||||||
if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
|
if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
|
||||||
model_settings['use_eager_attention'] = True
|
model_settings['use_eager_attention'] = True
|
||||||
|
|
||||||
# Read GPTQ metadata for old GPTQ loaders
|
|
||||||
if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2':
|
|
||||||
if 'bits' in metadata['quantization_config']:
|
|
||||||
model_settings['wbits'] = metadata['quantization_config']['bits']
|
|
||||||
if 'group_size' in metadata['quantization_config']:
|
|
||||||
model_settings['groupsize'] = metadata['quantization_config']['group_size']
|
|
||||||
if 'desc_act' in metadata['quantization_config']:
|
|
||||||
model_settings['desc_act'] = metadata['quantization_config']['desc_act']
|
|
||||||
|
|
||||||
# Read AutoGPTQ metadata
|
|
||||||
path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json')
|
|
||||||
if path.exists():
|
|
||||||
metadata = json.loads(open(path, 'r', encoding='utf-8').read())
|
|
||||||
if 'bits' in metadata:
|
|
||||||
model_settings['wbits'] = metadata['bits']
|
|
||||||
if 'group_size' in metadata:
|
|
||||||
model_settings['groupsize'] = metadata['group_size']
|
|
||||||
if 'desc_act' in metadata:
|
|
||||||
model_settings['desc_act'] = metadata['desc_act']
|
|
||||||
|
|
||||||
# Try to find the Jinja instruct template
|
# Try to find the Jinja instruct template
|
||||||
path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
|
path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
|
||||||
if path.exists():
|
if path.exists():
|
||||||
@ -178,7 +155,7 @@ def infer_loader(model_name, model_settings):
|
|||||||
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
|
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
|
||||||
if not path_to_model.exists():
|
if not path_to_model.exists():
|
||||||
loader = None
|
loader = None
|
||||||
elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
|
elif (path_to_model / 'quantize_config.json').exists(): # Old GPTQ metadata file
|
||||||
loader = 'ExLlamav2_HF'
|
loader = 'ExLlamav2_HF'
|
||||||
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
|
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
|
||||||
loader = 'llamacpp_HF'
|
loader = 'llamacpp_HF'
|
||||||
@ -215,16 +192,11 @@ def update_model_parameters(state, initial=False):
|
|||||||
if initial and element in shared.provided_arguments:
|
if initial and element in shared.provided_arguments:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Setting null defaults
|
if element in ['cpu_memory'] and value == 0:
|
||||||
if element in ['wbits', 'groupsize'] and value == 'None':
|
|
||||||
value = vars(shared.args_defaults)[element]
|
|
||||||
elif element in ['cpu_memory'] and value == 0:
|
|
||||||
value = vars(shared.args_defaults)[element]
|
value = vars(shared.args_defaults)[element]
|
||||||
|
|
||||||
# Making some simple conversions
|
# Making some simple conversions
|
||||||
if element in ['wbits', 'groupsize']:
|
if element == 'cpu_memory' and value is not None:
|
||||||
value = int(value)
|
|
||||||
elif element == 'cpu_memory' and value is not None:
|
|
||||||
value = f"{value}MiB"
|
value = f"{value}MiB"
|
||||||
|
|
||||||
setattr(shared.args, element, value)
|
setattr(shared.args, element, value)
|
||||||
@ -251,15 +223,12 @@ def apply_model_settings_to_state(model, state):
|
|||||||
loader = model_settings.pop('loader')
|
loader = model_settings.pop('loader')
|
||||||
|
|
||||||
# If the user is using an alternative loader for the same model type, let them keep using it
|
# If the user is using an alternative loader for the same model type, let them keep using it
|
||||||
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']):
|
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
|
||||||
state['loader'] = loader
|
state['loader'] = loader
|
||||||
|
|
||||||
for k in model_settings:
|
for k in model_settings:
|
||||||
if k in state:
|
if k in state:
|
||||||
if k in ['wbits', 'groupsize']:
|
state[k] = model_settings[k]
|
||||||
state[k] = str(model_settings[k])
|
|
||||||
else:
|
|
||||||
state[k] = model_settings[k]
|
|
||||||
|
|
||||||
return state
|
return state
|
||||||
|
|
||||||
|
@ -86,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
|
|||||||
|
|
||||||
# Model loader
|
# Model loader
|
||||||
group = parser.add_argument_group('Model loader')
|
group = parser.add_argument_group('Model loader')
|
||||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
|
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.')
|
||||||
|
|
||||||
# Transformers/Accelerate
|
# Transformers/Accelerate
|
||||||
group = parser.add_argument_group('Transformers/Accelerate')
|
group = parser.add_argument_group('Transformers/Accelerate')
|
||||||
@ -147,17 +147,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n
|
|||||||
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
|
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
|
||||||
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
|
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
|
||||||
|
|
||||||
# AutoGPTQ
|
|
||||||
group = parser.add_argument_group('AutoGPTQ')
|
|
||||||
group.add_argument('--triton', action='store_true', help='Use triton.')
|
|
||||||
group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.')
|
|
||||||
group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
|
|
||||||
group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
|
|
||||||
group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
|
|
||||||
group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
|
|
||||||
group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
|
|
||||||
group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
|
|
||||||
|
|
||||||
# HQQ
|
# HQQ
|
||||||
group = parser.add_argument_group('HQQ')
|
group = parser.add_argument_group('HQQ')
|
||||||
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
|
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
|
||||||
@ -220,6 +209,14 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='DEP
|
|||||||
group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED')
|
group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED')
|
||||||
group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED')
|
group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED')
|
||||||
group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED')
|
group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED')
|
||||||
|
group.add_argument('--triton', action='store_true', help='DEPRECATED')
|
||||||
|
group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED')
|
||||||
|
group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED')
|
||||||
|
group.add_argument('--desc_act', action='store_true', help='DEPRECATED')
|
||||||
|
group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED')
|
||||||
|
group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED')
|
||||||
|
group.add_argument('--wbits', type=int, default=0, help='DEPRECATED')
|
||||||
|
group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
args_defaults = parser.parse_args([])
|
args_defaults = parser.parse_args([])
|
||||||
@ -262,8 +259,6 @@ def fix_loader_name(name):
|
|||||||
return 'llamacpp_HF'
|
return 'llamacpp_HF'
|
||||||
elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
|
elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
|
||||||
return 'Transformers'
|
return 'Transformers'
|
||||||
elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
|
|
||||||
return 'AutoGPTQ'
|
|
||||||
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
|
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
|
||||||
return 'ExLlama'
|
return 'ExLlama'
|
||||||
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
|
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
|
||||||
|
@ -119,14 +119,6 @@ def list_model_elements():
|
|||||||
'compute_dtype',
|
'compute_dtype',
|
||||||
'quant_type',
|
'quant_type',
|
||||||
'use_double_quant',
|
'use_double_quant',
|
||||||
'wbits',
|
|
||||||
'groupsize',
|
|
||||||
'triton',
|
|
||||||
'desc_act',
|
|
||||||
'no_inject_fused_mlp',
|
|
||||||
'no_use_cuda_fp16',
|
|
||||||
'disable_exllama',
|
|
||||||
'disable_exllamav2',
|
|
||||||
'cfg_cache',
|
'cfg_cache',
|
||||||
'no_flash_attn',
|
'no_flash_attn',
|
||||||
'no_xformers',
|
'no_xformers',
|
||||||
|
@ -89,8 +89,6 @@ def create_ui():
|
|||||||
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
|
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
|
||||||
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
|
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
|
||||||
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
|
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
|
||||||
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
|
|
||||||
shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
|
|
||||||
shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
|
shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
|
||||||
shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
|
shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
|
||||||
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
|
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
|
||||||
@ -121,10 +119,6 @@ def create_ui():
|
|||||||
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
|
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
|
||||||
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
|
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
|
||||||
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
|
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
|
||||||
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
|
|
||||||
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
|
|
||||||
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
|
|
||||||
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
|
|
||||||
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
|
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
|
||||||
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
|
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
|
||||||
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
|
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
|
||||||
@ -136,13 +130,10 @@ def create_ui():
|
|||||||
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
|
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
|
||||||
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
|
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
|
||||||
shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.')
|
shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.')
|
||||||
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.')
|
|
||||||
shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
|
|
||||||
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
|
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
|
||||||
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
|
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
|
||||||
shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
|
shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
|
||||||
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
|
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
|
||||||
shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
|
|
||||||
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
|
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
|
||||||
|
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
|
@ -394,7 +394,7 @@ def update_requirements(initial_installation=False, pull=True):
|
|||||||
textgen_requirements = [
|
textgen_requirements = [
|
||||||
req.replace('+cu121', '+cu118').replace('+cu122', '+cu118')
|
req.replace('+cu121', '+cu118').replace('+cu122', '+cu118')
|
||||||
for req in textgen_requirements
|
for req in textgen_requirements
|
||||||
if "auto-gptq" not in req.lower() and "autoawq" not in req.lower()
|
if "autoawq" not in req.lower()
|
||||||
]
|
]
|
||||||
|
|
||||||
if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11
|
if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11
|
||||||
|
Loading…
x
Reference in New Issue
Block a user