From c1596f633fa141f4cde8a92bb8895fd10dc91869 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 27 Sep 2023 18:12:43 +0300
Subject: [PATCH] llama : fix kv cache heuristic when context is less than 32

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 73a636cea..b409b0d12 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4117,7 +4117,7 @@ static int llama_decode_internal(
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+    kv_self.n = std::min((int32_t) hparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

     //printf("kv_self.n = %d\n", kv_self.n);
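
Note: the bug fixed here is that the old expression std::max(32, ...) always yields at least 32 cells, so with a context smaller than 32 the heuristic could set kv_self.n beyond the actual cache size. Below is a minimal standalone sketch of the clamping behavior; the values n_ctx and cell_max are hypothetical stand-ins for hparams.n_ctx and llama_kv_cache_cell_max(kv_self), not the actual llama.cpp code path.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t n_ctx    = 16; // context smaller than the 32-cell floor
        const int32_t cell_max = 4;  // hypothetical: highest used KV cache cell + 1

        // old heuristic: floors at 32 cells, which can exceed n_ctx
        const int32_t n_old = std::max<int32_t>(32, cell_max);

        // fixed heuristic: additionally clamp to the actual context size
        const int32_t n_new = std::min<int32_t>(n_ctx, std::max<int32_t>(32, cell_max));

        printf("old: %d, new: %d\n", n_old, n_new); // prints "old: 32, new: 16"
        return 0;
    }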