AutoAWQ: initial support (#3999)

This commit is contained in:
cal066 2023-10-05 16:19:18 +00:00 committed by GitHub
parent 3f56151f03
commit cc632c3f33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 75 additions and 3 deletions

View File

@ -174,3 +174,5 @@
instruction_template: 'Llama-v2'
.*mistral.*instruct:
instruction_template: 'Mistral'
.*AWQ:
n_batch: 1

View File

@ -129,6 +129,16 @@ loaders_and_params = OrderedDict({
'model_type',
'no_mmap',
'mlock'
],
'AutoAWQ': [
'cpu_memory',
'gpu_memory',
'auto_devices',
'max_seq_len',
'n_batch',
'no_inject_fused_attention',
'trust_remote_code',
'use_fast',
]
})
@ -365,7 +375,40 @@ loaders_samplers = {
'top_k',
'repetition_penalty',
'repetition_penalty_range',
}
},
'AutoAWQ': {
'temperature',
'top_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'penalty_alpha',
'num_beams',
'length_penalty',
'early_stopping',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
}
loaders_model_types = {

View File

@ -63,6 +63,7 @@ def load_model(model_name, loader=None):
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ctransformers': ctransformers_loader,
'AutoAWQ': AutoAWQ_loader,
}
if loader is None:
@ -276,6 +277,24 @@ def ctransformers_loader(model_name):
model, tokenizer = ctrans.from_pretrained(model_file)
return model, tokenizer
def AutoAWQ_loader(model_name):
    """Load an AWQ-quantized causal LM from the configured model directory.

    Args:
        model_name: Name of the model sub-directory inside
            ``shared.args.model_dir``.

    Returns:
        The object returned by ``AutoAWQForCausalLM.from_quantized``. Only the
        model is returned; no tokenizer is loaded here.
    """
    # Imported lazily so the awq package is only needed when this loader is used.
    from awq import AutoAWQForCausalLM

    model_dir = Path(f'{shared.args.model_dir}/{model_name}')

    if shared.args.deepspeed:
        # Logger.warn is a deprecated alias; Logger.warning is the supported name.
        logger.warning("AutoAWQ is incompatible with deepspeed")

    # Pick the weight format from what is actually on disk. Whether remote code
    # is trusted says nothing about how the checkpoint was serialized.
    has_safetensors = any(model_dir.glob('*.safetensors'))

    model = AutoAWQForCausalLM.from_quantized(
        quant_path=model_dir,
        max_new_tokens=shared.args.max_seq_len,
        trust_remote_code=shared.args.trust_remote_code,
        fuse_layers=not shared.args.no_inject_fused_attention,
        max_memory=get_max_memory_dict(),
        batch_size=shared.args.n_batch,
        safetensors=has_safetensors,
    )

    return model
def GPTQ_loader(model_name):

View File

@ -107,10 +107,14 @@ def infer_loader(model_name, model_settings):
loader = None
elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
loader = 'AutoGPTQ'
elif (path_to_model / 'quant_config.json').exists():
loader = 'AutoAWQ'
elif len(list(path_to_model.glob('*.gguf'))) > 0:
loader = 'llama.cpp'
elif re.match(r'.*\.gguf', model_name.lower()):
loader = 'llama.cpp'
elif re.match(r'.*-awq', model_name.lower()):
loader = 'AutoAWQ'
elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
loader = 'RWKV'
elif re.match(r'.*exl2', model_name.lower()):

View File

@ -232,6 +232,8 @@ def fix_loader_name(name):
return 'ExLlamav2_HF'
elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
return 'ctransformers'
elif name in ['autoawq', 'awq', 'auto-awq']:
return 'AutoAWQ'
def add_extension(name):

View File

@ -99,7 +99,7 @@ def create_ui():
with gr.Column():
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.')
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')

View File

@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
autoawq==0.1.2

View File

@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
autoawq==0.1.2