mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 00:39:00 +01:00
llama : fix kv cache heuristic when context is less than 32
This commit is contained in:
parent
8845160058
commit
c1596f633f
@@ -4117,7 +4117,7 @@ static int llama_decode_internal(
         // after enough generations, the benefit from this heuristic disappears
         // if we start defragmenting the cache, the benefit from this will be more important
         //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-        kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+        kv_self.n = std::min((int32_t) hparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

         //printf("kv_self.n = %d\n", kv_self.n);
Loading…
Reference in New Issue
Block a user