mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-10-30 22:20:14 +01:00
8b66d83aa9
This increases tokens/second for HF loaders.
532 lines
12 KiB
Python
532 lines
12 KiB
Python
import functools
|
|
from collections import OrderedDict
|
|
|
|
import gradio as gr
|
|
|
|
from modules import shared
|
|
|
|
# Maps each loader name to the ordered list of parameter element names
# associated with it. make_loader_params_visible() shows only the names
# listed for the selected loader; get_all_params() takes the union of all
# lists. Key order and per-list order are kept as originally declared.
loaders_and_params = OrderedDict([
    ('Transformers', [
        'cpu_memory', 'gpu_memory', 'load_in_8bit', 'bf16', 'cpu', 'disk',
        'auto_devices', 'load_in_4bit', 'use_double_quant', 'quant_type',
        'compute_dtype', 'trust_remote_code', 'no_use_fast',
        'use_flash_attention_2', 'alpha_value', 'rope_freq_base',
        'compress_pos_emb', 'disable_exllama', 'transformers_info',
    ]),
    ('ExLlama_HF', [
        'gpu_split', 'max_seq_len', 'alpha_value', 'rope_freq_base',
        'compress_pos_emb', 'cfg_cache', 'no_use_fast', 'exllama_HF_info',
    ]),
    ('ExLlamav2_HF', [
        'gpu_split', 'max_seq_len', 'cfg_cache', 'no_flash_attn',
        'cache_8bit', 'alpha_value', 'compress_pos_emb', 'no_use_fast',
    ]),
    ('ExLlama', [
        'gpu_split', 'max_seq_len', 'alpha_value', 'rope_freq_base',
        'compress_pos_emb', 'exllama_info',
    ]),
    ('ExLlamav2', [
        'gpu_split', 'max_seq_len', 'no_flash_attn', 'cache_8bit',
        'alpha_value', 'compress_pos_emb',
    ]),
    ('AutoGPTQ', [
        'triton', 'no_inject_fused_attention', 'no_inject_fused_mlp',
        'no_use_cuda_fp16', 'wbits', 'groupsize', 'desc_act',
        'disable_exllama', 'gpu_memory', 'cpu_memory', 'cpu', 'disk',
        'auto_devices', 'trust_remote_code', 'no_use_fast', 'autogptq_info',
    ]),
    ('GPTQ-for-LLaMa', [
        'wbits', 'groupsize', 'model_type', 'pre_layer', 'no_use_fast',
        'gptq_for_llama_info',
    ]),
    ('llama.cpp', [
        'n_ctx', 'n_gpu_layers', 'tensor_split', 'n_batch', 'threads',
        'threads_batch', 'no_mmap', 'mlock', 'no_mul_mat_q', 'alpha_value',
        'rope_freq_base', 'compress_pos_emb', 'numa',
    ]),
    ('llamacpp_HF', [
        'n_ctx', 'n_gpu_layers', 'tensor_split', 'n_batch', 'threads',
        'threads_batch', 'no_mmap', 'mlock', 'no_mul_mat_q', 'alpha_value',
        'rope_freq_base', 'compress_pos_emb', 'numa', 'cfg_cache',
        'no_use_fast', 'logits_all', 'llamacpp_HF_info',
    ]),
    ('ctransformers', [
        'n_ctx', 'n_gpu_layers', 'n_batch', 'threads', 'model_type',
        'no_mmap', 'mlock',
    ]),
    ('AutoAWQ', [
        'cpu_memory', 'gpu_memory', 'auto_devices', 'max_seq_len',
        'no_inject_fused_attention', 'trust_remote_code', 'no_use_fast',
    ]),
])
|
|
|
|
# Sampler names that appear in the ExLlama_HF, ExLlamav2_HF and llamacpp_HF
# entries below. Transformers, AutoGPTQ, GPTQ-for-LLaMa and AutoAWQ use these
# plus the four names in _EXTRA_SAMPLERS.
_COMMON_SAMPLERS = (
    'temperature', 'temperature_last', 'top_p', 'min_p', 'top_k', 'typical_p',
    'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a',
    'repetition_penalty', 'presence_penalty', 'frequency_penalty',
    'repetition_penalty_range', 'encoder_repetition_penalty',
    'no_repeat_ngram_size', 'min_length', 'seed', 'do_sample',
    'mirostat_mode', 'mirostat_tau', 'mirostat_eta',
    'grammar_file_row', 'grammar_string',
    'guidance_scale', 'negative_prompt',
    'ban_eos_token', 'custom_token_bans', 'add_bos_token',
    'skip_special_tokens', 'auto_max_new_tokens',
)

# Names listed only for Transformers, AutoGPTQ, GPTQ-for-LLaMa and AutoAWQ.
_EXTRA_SAMPLERS = ('penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping')

# Maps each loader name to the set of sampler element names associated with
# it; blacklist_samplers() uses membership in these sets to decide
# visibility. Every loader gets its own set object (nothing is shared).
loaders_samplers = {
    'Transformers': set(_COMMON_SAMPLERS + _EXTRA_SAMPLERS),
    'ExLlama_HF': set(_COMMON_SAMPLERS),
    'ExLlama': {
        'temperature', 'top_p', 'top_k', 'typical_p',
        'repetition_penalty', 'repetition_penalty_range', 'seed',
        'guidance_scale', 'negative_prompt',
        'ban_eos_token', 'add_bos_token', 'custom_token_bans',
        'auto_max_new_tokens',
    },
    'ExLlamav2': {
        'temperature', 'top_p', 'min_p', 'top_k', 'typical_p', 'tfs',
        'repetition_penalty', 'repetition_penalty_range', 'seed',
        'mirostat_mode', 'mirostat_tau', 'mirostat_eta',
        'ban_eos_token', 'add_bos_token', 'custom_token_bans',
        'skip_special_tokens', 'auto_max_new_tokens',
    },
    'ExLlamav2_HF': set(_COMMON_SAMPLERS),
    'AutoGPTQ': set(_COMMON_SAMPLERS + _EXTRA_SAMPLERS),
    'GPTQ-for-LLaMa': set(_COMMON_SAMPLERS + _EXTRA_SAMPLERS),
    'llama.cpp': {
        'temperature', 'top_p', 'top_k', 'tfs',
        'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'seed',
        'mirostat_mode', 'mirostat_tau', 'mirostat_eta',
        'grammar_file_row', 'grammar_string',
        'ban_eos_token', 'custom_token_bans',
    },
    'llamacpp_HF': set(_COMMON_SAMPLERS),
    'ctransformers': {
        'temperature', 'top_p', 'top_k',
        'repetition_penalty', 'repetition_penalty_range',
    },
    'AutoAWQ': set(_COMMON_SAMPLERS + _EXTRA_SAMPLERS),
}
|
|
|
|
# Model-type choices for the loaders that have a 'model_type' parameter.
# get_model_types() falls back to ["None"] for any loader not listed here.
loaders_model_types = {
    'GPTQ-for-LLaMa': ["None", "llama", "opt", "gptj"],
    'ctransformers': [
        "None", "gpt2", "gptj", "gptneox", "llama", "mpt",
        "dollyv2", "replit", "starcoder", "gptbigcode", "falcon",
    ],
}
|
|
|
|
|
|
@functools.cache
def list_all_samplers():
    """Return the sorted union of the sampler names of every loader.

    The names are gathered from all value sets in ``loaders_samplers``.
    Because of ``functools.cache`` the computation runs once; later calls
    return the very same list object.
    """
    merged = set().union(*loaders_samplers.values())
    return sorted(merged)
|
|
|
|
|
|
def blacklist_samplers(loader):
    """Build one ``gr.update`` per sampler, toggling visibility for ``loader``.

    The returned list follows the order of ``list_all_samplers()``. For the
    special loader name ``'All'`` every sampler is made visible; otherwise a
    sampler is visible only when it is in ``loaders_samplers[loader]``.
    A loader name missing from ``loaders_samplers`` raises ``KeyError``.
    """
    names = list_all_samplers()
    if loader == 'All':
        return [gr.update(visible=True) for _ in names]

    # The membership test already yields the True/False wanted for `visible`.
    return [gr.update(visible=name in loaders_samplers[loader]) for name in names]
|
|
|
|
|
|
def get_model_types(loader):
    """Return the model-type choices registered for ``loader``.

    Loaders without an entry in ``loaders_model_types`` get a fresh
    ``["None"]`` list.
    """
    return loaders_model_types.get(loader, ["None"])
|
|
|
|
|
|
def get_gpu_memory_keys():
    """Return the keys of ``shared.gradio`` that start with ``'gpu_memory'``."""
    return list(filter(lambda name: name.startswith('gpu_memory'), shared.gradio))
|
|
|
|
|
|
@functools.cache
def get_all_params():
    """Return the sorted union of parameter names across all loaders.

    When the generic ``'gpu_memory'`` name is present it is swapped for the
    keys reported by ``get_gpu_memory_keys()``. The result is memoized by
    ``functools.cache``, so it reflects the state seen on the first call and
    the same list object is returned afterwards.
    """
    names = set().union(*loaders_and_params.values())

    if 'gpu_memory' in names:
        names.discard('gpu_memory')
        names.update(get_gpu_memory_keys())

    return sorted(names)
|
|
|
|
|
|
def make_loader_params_visible(loader):
    """Build one ``gr.update`` per known parameter, showing only ``loader``'s.

    The returned list follows the order of ``get_all_params()``. A parameter
    is visible when it is listed for ``loader`` in ``loaders_and_params``,
    with the generic ``'gpu_memory'`` entry replaced by the keys from
    ``get_gpu_memory_keys()``. An unknown loader hides every parameter.

    The loader's list is copied before it is edited. Previously the
    module-level list itself was modified, so the first call permanently
    rewrote ``loaders_and_params[loader]``.
    """
    all_params = get_all_params()

    # Work on a private copy so the shared table is never mutated.
    params = list(loaders_and_params.get(loader, ()))

    if 'gpu_memory' in params:
        params.remove('gpu_memory')
        params += get_gpu_memory_keys()

    shown = set(params)
    return [gr.update(visible=k in shown) for k in all_params]
|