mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-24 08:56:52 +01:00
Remove AutoAWQ as a standalone loader
(it works better through transformers)
This commit is contained in:
parent
f66ab63d64
commit
e6181e834a
@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names):
|
||||
else:
|
||||
if len(lora_names) > 1:
|
||||
logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
|
||||
if not shared.args.no_inject_fused_attention:
|
||||
logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')
|
||||
|
||||
peft_config = GPTQLoraConfig(
|
||||
inference_mode=True,
|
||||
|
@ -127,15 +127,6 @@ loaders_and_params = OrderedDict({
|
||||
'no_use_fast',
|
||||
'autogptq_info',
|
||||
],
|
||||
'AutoAWQ': [
|
||||
'cpu_memory',
|
||||
'gpu_memory',
|
||||
'auto_devices',
|
||||
'max_seq_len',
|
||||
'no_inject_fused_attention',
|
||||
'trust_remote_code',
|
||||
'no_use_fast',
|
||||
],
|
||||
'HQQ': [
|
||||
'hqq_backend',
|
||||
'trust_remote_code',
|
||||
@ -200,7 +191,6 @@ def transformers_samplers():
|
||||
loaders_samplers = {
|
||||
'Transformers': transformers_samplers(),
|
||||
'AutoGPTQ': transformers_samplers(),
|
||||
'AutoAWQ': transformers_samplers(),
|
||||
'HQQ': transformers_samplers(),
|
||||
'ExLlamav2': {
|
||||
'temperature',
|
||||
|
@ -75,7 +75,6 @@ def load_model(model_name, loader=None):
|
||||
'llamacpp_HF': llamacpp_HF_loader,
|
||||
'ExLlamav2': ExLlamav2_loader,
|
||||
'ExLlamav2_HF': ExLlamav2_HF_loader,
|
||||
'AutoAWQ': AutoAWQ_loader,
|
||||
'HQQ': HQQ_loader,
|
||||
'TensorRT-LLM': TensorRT_LLM_loader,
|
||||
}
|
||||
@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name):
|
||||
return model
|
||||
|
||||
|
||||
def AutoAWQ_loader(model_name):
    """Load a quantized model through AutoAWQ and return it.

    The model is looked up under ``shared.args.model_dir`` in the
    sub-directory named ``model_name``. Loading options are taken from
    ``shared.args`` (``max_seq_len``, ``trust_remote_code``,
    ``no_inject_fused_attention``) and from ``get_max_memory_dict()``.

    Args:
        model_name: Name of the model folder inside the model directory.

    Returns:
        The object returned by ``AutoAWQForCausalLM.from_quantized``.
    """
    # Imported here rather than at module level, so `awq` is only needed
    # when this loader is actually invoked.
    from awq import AutoAWQForCausalLM

    model_dir = Path(f'{shared.args.model_dir}/{model_name}')

    # Keyword arguments are collected in the same order in which they are
    # passed to (and evaluated for) the from_quantized call.
    load_options = {
        'quant_path': model_dir,
        'max_new_tokens': shared.args.max_seq_len,
        'trust_remote_code': shared.args.trust_remote_code,
        # Layers are fused unless the user asked to disable fused attention.
        'fuse_layers': not shared.args.no_inject_fused_attention,
        'max_memory': get_max_memory_dict(),
        'batch_size': 1,
        # True when at least one *.safetensors file exists in the model dir.
        'safetensors': any(model_dir.glob('*.safetensors')),
    }

    return AutoAWQForCausalLM.from_quantized(**load_options)
|
||||
|
||||
|
||||
def AutoGPTQ_loader(model_name):
|
||||
import modules.AutoGPTQ_loader
|
||||
|
||||
|
@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings):
|
||||
loader = None
|
||||
elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
|
||||
loader = 'ExLlamav2_HF'
|
||||
elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
|
||||
loader = 'AutoAWQ'
|
||||
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
|
||||
loader = 'llamacpp_HF'
|
||||
elif len(list(path_to_model.glob('*.gguf'))) > 0:
|
||||
|
@ -89,7 +89,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
|
||||
|
||||
# Model loader
|
||||
group = parser.add_argument_group('Model loader')
|
||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
|
||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
|
||||
|
||||
# Transformers/Accelerate
|
||||
group = parser.add_argument_group('Transformers/Accelerate')
|
||||
@ -160,10 +160,6 @@ group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExL
|
||||
group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
|
||||
group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
|
||||
|
||||
# AutoAWQ
|
||||
group = parser.add_argument_group('AutoAWQ')
|
||||
group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
|
||||
|
||||
# HQQ
|
||||
group = parser.add_argument_group('HQQ')
|
||||
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
|
||||
@ -217,6 +213,7 @@ group.add_argument('--model_type', type=str, help='DEPRECATED')
|
||||
group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
|
||||
group.add_argument('--checkpoint', type=str, help='DEPRECATED')
|
||||
group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED')
|
||||
|
||||
args = parser.parse_args()
|
||||
args_defaults = parser.parse_args([])
|
||||
@ -267,8 +264,6 @@ def fix_loader_name(name):
|
||||
return 'ExLlamav2'
|
||||
elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
|
||||
return 'ExLlamav2_HF'
|
||||
elif name in ['autoawq', 'awq', 'auto-awq']:
|
||||
return 'AutoAWQ'
|
||||
elif name in ['hqq']:
|
||||
return 'HQQ'
|
||||
elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
|
||||
|
@ -78,7 +78,6 @@ def list_model_elements():
|
||||
'groupsize',
|
||||
'triton',
|
||||
'desc_act',
|
||||
'no_inject_fused_attention',
|
||||
'no_inject_fused_mlp',
|
||||
'no_use_cuda_fp16',
|
||||
'disable_exllama',
|
||||
|
@ -127,7 +127,6 @@ def create_ui():
|
||||
shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
|
||||
shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
|
||||
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
|
||||
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
|
||||
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
|
||||
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
|
||||
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
|
||||
|
Loading…
Reference in New Issue
Block a user