Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2024-11-22 08:07:56 +01:00)
Remove AutoAWQ as a standalone loader
(it works better through transformers)
parent f66ab63d64
commit e6181e834a
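
The rationale in the commit message is that AWQ-quantized checkpoints no longer need a dedicated loader: recent Transformers releases read the quantization_config block (quant_method "awq") from a model's config.json and dispatch to the autoawq kernels on their own. As a rough illustration of that path (not part of this diff; it assumes transformers >= 4.35 plus the autoawq and accelerate packages, and the model id below is only a placeholder):

```python
# Hedged sketch of loading an AWQ checkpoint through plain Transformers.
# Assumptions: transformers >= 4.35, autoawq and accelerate are installed;
# the model id is a placeholder, not something referenced by this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"  # placeholder AWQ model

tokenizer = AutoTokenizer.from_pretrained(model_id)
# from_pretrained detects quant_method == "awq" in config.json and loads the
# quantized weights directly, so no separate AutoAWQ loader is required.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
```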
@@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names):
     else:
         if len(lora_names) > 1:
             logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
-        if not shared.args.no_inject_fused_attention:
-            logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')
 
         peft_config = GPTQLoraConfig(
             inference_mode=True,
@@ -127,15 +127,6 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
         'autogptq_info',
     ],
-    'AutoAWQ': [
-        'cpu_memory',
-        'gpu_memory',
-        'auto_devices',
-        'max_seq_len',
-        'no_inject_fused_attention',
-        'trust_remote_code',
-        'no_use_fast',
-    ],
     'HQQ': [
         'hqq_backend',
         'trust_remote_code',
@@ -200,7 +191,6 @@ def transformers_samplers():
 loaders_samplers = {
     'Transformers': transformers_samplers(),
     'AutoGPTQ': transformers_samplers(),
-    'AutoAWQ': transformers_samplers(),
     'HQQ': transformers_samplers(),
     'ExLlamav2': {
         'temperature',
@@ -75,7 +75,6 @@ def load_model(model_name, loader=None):
         'llamacpp_HF': llamacpp_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
-        'AutoAWQ': AutoAWQ_loader,
         'HQQ': HQQ_loader,
         'TensorRT-LLM': TensorRT_LLM_loader,
     }
@@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name):
     return model
 
 
-def AutoAWQ_loader(model_name):
-    from awq import AutoAWQForCausalLM
-
-    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-
-    model = AutoAWQForCausalLM.from_quantized(
-        quant_path=model_dir,
-        max_new_tokens=shared.args.max_seq_len,
-        trust_remote_code=shared.args.trust_remote_code,
-        fuse_layers=not shared.args.no_inject_fused_attention,
-        max_memory=get_max_memory_dict(),
-        batch_size=1,
-        safetensors=any(model_dir.glob('*.safetensors')),
-    )
-
-    return model
-
-
 def AutoGPTQ_loader(model_name):
     import modules.AutoGPTQ_loader
 
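
For context, the deleted AutoAWQ_loader above called AutoAWQForCausalLM.from_quantized directly. With the loader gone, the same checkpoints go through the webui's regular Transformers path; the sketch below is only a hedged approximation of how the removed options map onto that path (max_memory and trust_remote_code carry over to from_pretrained, while fuse_layers has no direct equivalent), and the helper name is hypothetical:

```python
# Hypothetical helper, not code from this repository: an approximate
# Transformers-side equivalent of the removed AutoAWQ_loader options.
from pathlib import Path
from transformers import AutoModelForCausalLM


def load_awq_through_transformers(model_dir: Path, max_memory=None, trust_remote_code=False):
    # device_map="auto" plus max_memory replaces the max_memory= argument that
    # AutoAWQForCausalLM.from_quantized received; fuse_layers is simply dropped.
    return AutoModelForCausalLM.from_pretrained(
        model_dir,
        device_map="auto",
        max_memory=max_memory,
        trust_remote_code=trust_remote_code,
    )
```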
@@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings):
         loader = None
     elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
         loader = 'ExLlamav2_HF'
-    elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
-        loader = 'AutoAWQ'
     elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
         loader = 'llamacpp_HF'
     elif len(list(path_to_model.glob('*.gguf'))) > 0:
@@ -89,7 +89,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
 
 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -160,10 +160,6 @@ group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExL
 group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
 group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
 
-# AutoAWQ
-group = parser.add_argument_group('AutoAWQ')
-group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
-
 # HQQ
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
@@ -217,6 +213,7 @@ group.add_argument('--model_type', type=str, help='DEPRECATED')
 group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
 group.add_argument('--checkpoint', type=str, help='DEPRECATED')
 group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
+group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED')
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -267,8 +264,6 @@ def fix_loader_name(name):
         return 'ExLlamav2'
     elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
         return 'ExLlamav2_HF'
-    elif name in ['autoawq', 'awq', 'auto-awq']:
-        return 'AutoAWQ'
     elif name in ['hqq']:
         return 'HQQ'
     elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
@@ -78,7 +78,6 @@ def list_model_elements():
         'groupsize',
         'triton',
         'desc_act',
-        'no_inject_fused_attention',
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'disable_exllama',
@@ -127,7 +127,6 @@ def create_ui():
                             shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
                             shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
                             shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
-                            shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
                             shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
                             shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                             shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')