mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 08:07:56 +01:00
AutoAWQ: initial support (#3999)
This commit is contained in:
parent
3f56151f03
commit
cc632c3f33
@ -174,3 +174,5 @@
|
||||
instruction_template: 'Llama-v2'
|
||||
.*mistral.*instruct:
|
||||
instruction_template: 'Mistral'
|
||||
.*AWQ:
|
||||
n_batch: 1
|
||||
|
@ -129,6 +129,16 @@ loaders_and_params = OrderedDict({
|
||||
'model_type',
|
||||
'no_mmap',
|
||||
'mlock'
|
||||
],
|
||||
'AutoAWQ': [
|
||||
'cpu_memory',
|
||||
'gpu_memory',
|
||||
'auto_devices',
|
||||
'max_seq_len',
|
||||
'n_batch',
|
||||
'no_inject_fused_attention',
|
||||
'trust_remote_code',
|
||||
'use_fast',
|
||||
]
|
||||
})
|
||||
|
||||
@ -365,7 +375,40 @@ loaders_samplers = {
|
||||
'top_k',
|
||||
'repetition_penalty',
|
||||
'repetition_penalty_range',
|
||||
}
|
||||
},
|
||||
'AutoAWQ': {
|
||||
'temperature',
|
||||
'top_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
'eta_cutoff',
|
||||
'tfs',
|
||||
'top_a',
|
||||
'repetition_penalty',
|
||||
'repetition_penalty_range',
|
||||
'encoder_repetition_penalty',
|
||||
'no_repeat_ngram_size',
|
||||
'min_length',
|
||||
'seed',
|
||||
'do_sample',
|
||||
'penalty_alpha',
|
||||
'num_beams',
|
||||
'length_penalty',
|
||||
'early_stopping',
|
||||
'mirostat_mode',
|
||||
'mirostat_tau',
|
||||
'mirostat_eta',
|
||||
'grammar_file_row',
|
||||
'grammar_string',
|
||||
'guidance_scale',
|
||||
'negative_prompt',
|
||||
'ban_eos_token',
|
||||
'custom_token_bans',
|
||||
'add_bos_token',
|
||||
'skip_special_tokens',
|
||||
'auto_max_new_tokens',
|
||||
},
|
||||
}
|
||||
|
||||
loaders_model_types = {
|
||||
|
@ -63,6 +63,7 @@ def load_model(model_name, loader=None):
|
||||
'ExLlamav2': ExLlamav2_loader,
|
||||
'ExLlamav2_HF': ExLlamav2_HF_loader,
|
||||
'ctransformers': ctransformers_loader,
|
||||
'AutoAWQ': AutoAWQ_loader,
|
||||
}
|
||||
|
||||
if loader is None:
|
||||
@ -276,6 +277,24 @@ def ctransformers_loader(model_name):
|
||||
model, tokenizer = ctrans.from_pretrained(model_file)
|
||||
return model, tokenizer
|
||||
|
||||
def AutoAWQ_loader(model_name):
|
||||
from awq import AutoAWQForCausalLM
|
||||
|
||||
model_dir = Path(f'{shared.args.model_dir}/{model_name}')
|
||||
|
||||
if shared.args.deepspeed:
|
||||
logger.warn("AutoAWQ is incompatible with deepspeed")
|
||||
|
||||
model = AutoAWQForCausalLM.from_quantized(
|
||||
quant_path=model_dir,
|
||||
max_new_tokens=shared.args.max_seq_len,
|
||||
trust_remote_code=shared.args.trust_remote_code,
|
||||
fuse_layers=not shared.args.no_inject_fused_attention,
|
||||
max_memory=get_max_memory_dict(),
|
||||
batch_size=shared.args.n_batch,
|
||||
safetensors=not shared.args.trust_remote_code)
|
||||
|
||||
return model
|
||||
|
||||
def GPTQ_loader(model_name):
|
||||
|
||||
|
@ -107,10 +107,14 @@ def infer_loader(model_name, model_settings):
|
||||
loader = None
|
||||
elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
|
||||
loader = 'AutoGPTQ'
|
||||
elif (path_to_model / 'quant_config.json').exists():
|
||||
loader = 'AutoAWQ'
|
||||
elif len(list(path_to_model.glob('*.gguf'))) > 0:
|
||||
loader = 'llama.cpp'
|
||||
elif re.match(r'.*\.gguf', model_name.lower()):
|
||||
loader = 'llama.cpp'
|
||||
elif re.match(r'.*-awq', model_name.lower()):
|
||||
loader = 'AutoAWQ'
|
||||
elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
|
||||
loader = 'RWKV'
|
||||
elif re.match(r'.*exl2', model_name.lower()):
|
||||
|
@ -232,6 +232,8 @@ def fix_loader_name(name):
|
||||
return 'ExLlamav2_HF'
|
||||
elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
|
||||
return 'ctransformers'
|
||||
elif name in ['autoawq', 'awq', 'auto-awq']:
|
||||
return 'AutoAWQ'
|
||||
|
||||
|
||||
def add_extension(name):
|
||||
|
@ -99,7 +99,7 @@ def create_ui():
|
||||
|
||||
with gr.Column():
|
||||
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
|
||||
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.')
|
||||
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
|
||||
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
|
||||
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
|
||||
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
|
||||
|
@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
|
||||
autoawq==0.1.2
|
||||
|
@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
|
||||
autoawq==0.1.2
|
||||
|
Loading…
Reference in New Issue
Block a user