From 57dd55e2c742bfc50e0f5c6fb95c14118cff44f6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 12 Oct 2023 09:29:04 +0300
Subject: [PATCH] server : fix kv cache management (#3588)

---
 examples/server/server.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d992feeef..ee0ababb1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -405,6 +405,7 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
+
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
@@ -412,6 +413,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,9 +465,6 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
         embd = prompt_tokens;
 
         if (n_past == num_prompt_tokens)
         {
@@ -471,6 +472,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
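
Note on the fix: the substance is ordering. Since #3228 the server has to evict stale KV-cache entries itself, and the eviction must happen after the "n_past--" adjustment that forces at least one token to be evaluated. The previous placement trimmed the cache before that adjustment, so a fully cached prompt left one stale cell at the very position about to be re-decoded; the patch also adds the eviction to the first code path, which was missing it entirely. Below is a minimal sketch of the resulting pattern, assuming the llama.h API of this era; common_prefix() and prepare_prompt() are illustrative stand-ins, not the server's actual interface (its own helper is common_part()).

    // Sketch of the KV-cache handling this patch establishes.
    // Assumes llama.h as of Oct 2023; helper names are hypothetical.
    #include "llama.h"

    #include <vector>

    // length of the shared token prefix between the cached context and the new prompt
    static size_t common_prefix(const std::vector<llama_token> & a,
                                const std::vector<llama_token> & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            i++;
        }
        return i;
    }

    // reuse the cached prefix, then evict everything past it before decoding
    static int prepare_prompt(llama_context * ctx,
                              std::vector<llama_token> & embd,                 // tokens currently cached
                              const std::vector<llama_token> & prompt_tokens)  // new prompt
    {
        int n_past = (int) common_prefix(embd, prompt_tokens);
        embd = prompt_tokens;

        if (n_past == (int) prompt_tokens.size()) {
            // we have to evaluate at least 1 token to generate logits
            n_past--;
        }

        // since #3228 the KV cache is managed manually: remove positions
        // [n_past, end) of sequence 0 so decoding can re-fill them cleanly
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

        return n_past;
    }

With p1 = -1 the call removes every position from n_past to the end of sequence 0, so the subsequent decode of the remaining prompt tokens repopulates exactly the evicted range and the stale-cell overlap of the previous ordering cannot occur.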