From c1596f633fa141f4cde8a92bb8895fd10dc91869 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 27 Sep 2023 18:12:43 +0300
Subject: [PATCH] llama : fix kv cache heuristic when context is less than 32

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 73a636cea..b409b0d12 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4117,7 +4117,7 @@ static int llama_decode_internal(
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+    kv_self.n = std::min((int32_t) hparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

     //printf("kv_self.n = %d\n", kv_self.n);
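
Note: the bug fixed here is that the old expression std::max(32, ...) always yields at least 32 cells, so with a context smaller than 32 the heuristic could set kv_self.n beyond the actual cache size. Below is a minimal standalone sketch of the clamping behavior; the values n_ctx and cell_max are hypothetical stand-ins for hparams.n_ctx and llama_kv_cache_cell_max(kv_self), not the actual llama.cpp code path.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t n_ctx    = 16; // context smaller than the 32-cell floor
        const int32_t cell_max = 4;  // hypothetical: highest used KV cache cell + 1

        // old heuristic: floors at 32 cells, which can exceed n_ctx
        const int32_t n_old = std::max<int32_t>(32, cell_max);

        // fixed heuristic: additionally clamp to the actual context size
        const int32_t n_new = std::min<int32_t>(n_ctx, std::max<int32_t>(32, cell_max));

        printf("old: %d, new: %d\n", n_old, n_new); // prints "old: 32, new: 16"
        return 0;
    }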