oobabooga 2a1063eff5 Revert "Remove non-HF ExLlamaV2 loader (#5431)"
This reverts commit cde000d47801fa13c5a88f9e435da64132bd96bc.
2024-02-06 06:21:36 -08:00

425 lines
9.5 KiB
Python

import functools
from collections import OrderedDict
import gradio as gr
from modules import shared
loaders_and_params = OrderedDict({
'Transformers': [
'cpu_memory',
'gpu_memory',
'load_in_8bit',
'bf16',
'cpu',
'disk',
'auto_devices',
'load_in_4bit',
'use_double_quant',
'quant_type',
'compute_dtype',
'trust_remote_code',
'no_use_fast',
'use_flash_attention_2',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
'disable_exllama',
'disable_exllamav2',
'transformers_info',
],
'llama.cpp': [
'n_ctx',
'n_gpu_layers',
'tensor_split',
'n_batch',
'threads',
'threads_batch',
'no_mmap',
'mlock',
'no_mul_mat_q',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
'cpu',
'numa',
'no_offload_kqv',
'row_split',
'tensorcores',
],
'llamacpp_HF': [
'n_ctx',
'n_gpu_layers',
'tensor_split',
'n_batch',
'threads',
'threads_batch',
'no_mmap',
'mlock',
'no_mul_mat_q',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
'cpu',
'numa',
'cfg_cache',
'trust_remote_code',
'no_use_fast',
'logits_all',
'no_offload_kqv',
'row_split',
'tensorcores',
'llamacpp_HF_info',
],
'ExLlamav2_HF': [
'gpu_split',
'max_seq_len',
'cfg_cache',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'alpha_value',
'compress_pos_emb',
'trust_remote_code',
'no_use_fast',
],
'ExLlamav2': [
'gpu_split',
'max_seq_len',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'alpha_value',
'compress_pos_emb',
'exllamav2_info',
],
'AutoGPTQ': [
'triton',
'no_inject_fused_attention',
'no_inject_fused_mlp',
'no_use_cuda_fp16',
'wbits',
'groupsize',
'desc_act',
'disable_exllama',
'disable_exllamav2',
'gpu_memory',
'cpu_memory',
'cpu',
'disk',
'auto_devices',
'trust_remote_code',
'no_use_fast',
'autogptq_info',
],
'AutoAWQ': [
'cpu_memory',
'gpu_memory',
'auto_devices',
'max_seq_len',
'no_inject_fused_attention',
'trust_remote_code',
'no_use_fast',
],
'GPTQ-for-LLaMa': [
'wbits',
'groupsize',
'model_type',
'pre_layer',
'trust_remote_code',
'no_use_fast',
'gptq_for_llama_info',
],
'ctransformers': [
'n_ctx',
'n_gpu_layers',
'n_batch',
'threads',
'model_type',
'no_mmap',
'mlock'
],
'QuIP#': [
'trust_remote_code',
'no_use_fast',
'no_flash_attn',
'quipsharp_info',
],
'HQQ': [
'hqq_backend',
'trust_remote_code',
'no_use_fast',
]
})
def transformers_samplers():
return {
'temperature',
'temperature_last',
'dynamic_temperature',
'dynatemp_low',
'dynatemp_high',
'dynatemp_exponent',
'smoothing_factor',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'penalty_alpha',
'num_beams',
'length_penalty',
'early_stopping',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'sampler_priority',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
'prompt_lookup_num_tokens'
}
loaders_samplers = {
'Transformers': transformers_samplers(),
'AutoGPTQ': transformers_samplers(),
'GPTQ-for-LLaMa': transformers_samplers(),
'AutoAWQ': transformers_samplers(),
'QuIP#': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav2': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'seed',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'ban_eos_token',
'add_bos_token',
'custom_token_bans',
'skip_special_tokens',
'auto_max_new_tokens',
},
'ExLlamav2_HF': {
'temperature',
'temperature_last',
'dynamic_temperature',
'dynatemp_low',
'dynatemp_high',
'dynatemp_exponent',
'smoothing_factor',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'sampler_priority',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
'llama.cpp': {
'temperature',
'top_p',
'min_p',
'top_k',
'typical_p',
'tfs',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'seed',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'ban_eos_token',
'custom_token_bans',
},
'llamacpp_HF': {
'temperature',
'temperature_last',
'dynamic_temperature',
'dynatemp_low',
'dynatemp_high',
'dynatemp_exponent',
'smoothing_factor',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'sampler_priority',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
'ctransformers': {
'temperature',
'top_p',
'top_k',
'repetition_penalty',
'repetition_penalty_range',
},
}
loaders_model_types = {
'GPTQ-for-LLaMa': [
"None",
"llama",
"opt",
"gptj"
],
'ctransformers': [
"None",
"gpt2",
"gptj",
"gptneox",
"llama",
"mpt",
"dollyv2",
"replit",
"starcoder",
"gptbigcode",
"falcon"
],
}
@functools.cache
def list_all_samplers():
all_samplers = set()
for k in loaders_samplers:
for sampler in loaders_samplers[k]:
all_samplers.add(sampler)
return sorted(all_samplers)
def blacklist_samplers(loader, dynamic_temperature):
all_samplers = list_all_samplers()
output = []
for sampler in all_samplers:
if loader == 'All' or sampler in loaders_samplers[loader]:
if sampler.startswith('dynatemp'):
output.append(gr.update(visible=dynamic_temperature))
else:
output.append(gr.update(visible=True))
else:
output.append(gr.update(visible=False))
return output
def get_model_types(loader):
if loader in loaders_model_types:
return loaders_model_types[loader]
return ["None"]
def get_gpu_memory_keys():
return [k for k in shared.gradio if k.startswith('gpu_memory')]
@functools.cache
def get_all_params():
all_params = set()
for k in loaders_and_params:
for el in loaders_and_params[k]:
all_params.add(el)
if 'gpu_memory' in all_params:
all_params.remove('gpu_memory')
for k in get_gpu_memory_keys():
all_params.add(k)
return sorted(all_params)
def make_loader_params_visible(loader):
params = []
all_params = get_all_params()
if loader in loaders_and_params:
params = loaders_and_params[loader]
if 'gpu_memory' in params:
params.remove('gpu_memory')
params += get_gpu_memory_keys()
return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]