Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2024-11-21 23:57:58 +01:00
llama.cpp: add 4-bit/8-bit kv cache options
commit 4ea260098f
parent 220c1797fc
```diff
@@ -221,6 +221,13 @@ class LlamacppHF(PreTrainedModel):
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)
```
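The magic numbers map directly onto ggml's `ggml_type` enum, where `GGML_TYPE_Q4_0 = 2` and `GGML_TYPE_Q8_0 = 8`: the 4-bit option stores both the K and V caches as Q4_0, the 8-bit option as Q8_0, and leaving both flags off keeps llama.cpp's default f16 cache. A minimal standalone sketch of the same call with llama-cpp-python (the model path is a placeholder; llama.cpp generally requires flash attention for a quantized V cache, which is why `flash_attn` is already among the params):

```python
from llama_cpp import Llama

# ggml_type enum values from ggml.h
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q8_0 = 8

llm = Llama(
    model_path="/path/to/model.gguf",  # placeholder
    n_ctx=8192,
    flash_attn=True,                   # quantized V cache needs flash attention
    type_k=GGML_TYPE_Q4_0,             # K cache quantized to 4 bits
    type_v=GGML_TYPE_Q4_0,             # V cache quantized to 4 bits
)
```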
```diff
@@ -100,6 +100,13 @@ class LlamaCppModel:
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
```
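The payoff is KV cache VRAM: per element, f16 takes 2 bytes, Q8_0 about 1.06 bytes (34-byte blocks of 32 values, one f16 scale each), and Q4_0 about 0.56 bytes (18-byte blocks of 32 values). A back-of-the-envelope estimator, using illustrative 7B-class dimensions and ignoring grouped-query attention (which shrinks the KV dimension on newer models):

```python
# Bytes per element for each KV cache type:
#   f16  -> 2.0
#   Q8_0 -> 34 / 32 = 1.0625 (32 int8 values + one f16 scale per block)
#   Q4_0 -> 18 / 32 = 0.5625 (32 4-bit values + one f16 scale per block)
BYTES_PER_ELEMENT = {"f16": 2.0, "q8_0": 34 / 32, "q4_0": 18 / 32}

def kv_cache_bytes(n_layers: int, n_ctx: int, n_embd: int, dtype: str) -> float:
    # Two tensors (K and V) per layer, each n_ctx * n_embd elements.
    return 2 * n_layers * n_ctx * n_embd * BYTES_PER_ELEMENT[dtype]

# Illustrative Llama-7B-like shape: 32 layers, n_embd 4096, 8k context.
for dtype in ("f16", "q8_0", "q4_0"):
    print(f"{dtype}: {kv_cache_bytes(32, 8192, 4096, dtype) / 2**30:.2f} GiB")
# f16: 4.00 GiB, q8_0: 2.12 GiB, q4_0: 1.12 GiB
```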
```diff
@@ -30,6 +30,8 @@ loaders_and_params = OrderedDict({
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
```
```diff
@@ -51,6 +53,8 @@ loaders_and_params = OrderedDict({
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
```
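Listing 'cache_8bit' and 'cache_4bit' under both loaders is what surfaces the two checkboxes in the UI for llama.cpp and llamacpp_HF; the same booleans arrive via shared.args, and cache_4bit takes precedence when both are set, as the if/elif in the hunks above shows. A toy sketch of that precedence (the argparse wiring here is illustrative, not the webui's actual flag definitions):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cache_4bit", action="store_true")  # Q4_0 KV cache
parser.add_argument("--cache_8bit", action="store_true")  # Q8_0 KV cache
args = parser.parse_args(["--cache_4bit", "--cache_8bit"])

params = {}
if args.cache_4bit:                            # checked first, so it wins
    params["type_k"] = params["type_v"] = 2    # GGML_TYPE_Q4_0
elif args.cache_8bit:
    params["type_k"] = params["type_v"] = 8    # GGML_TYPE_Q8_0

print(params)  # {'type_k': 2, 'type_v': 2}
```

When neither flag is set, type_k and type_v are never added to params, so llama.cpp keeps its default f16 cache.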