From 57dd55e2c742bfc50e0f5c6fb95c14118cff44f6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 12 Oct 2023 09:29:04 +0300
Subject: [PATCH] server : fix kv cache management (#3588)

---
 examples/server/server.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d992feeef..ee0ababb1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -405,6 +405,7 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
+
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
@@ -412,6 +413,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,9 +465,6 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
         embd = prompt_tokens;
 
         if (n_past == num_prompt_tokens)
         {
@@ -471,6 +472,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
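
Note on the fix: the substance is ordering. Since #3228 the server has to evict stale KV-cache entries itself, and the eviction must happen after the "n_past--" adjustment that forces at least one token to be evaluated. The previous placement trimmed the cache before that adjustment, so a fully cached prompt left one stale cell at the very position about to be re-decoded; the patch also adds the eviction to the first code path, which was missing it entirely. Below is a minimal sketch of the resulting pattern, assuming the llama.h API of this era; common_prefix() and prepare_prompt() are illustrative stand-ins, not the server's actual interface (its own helper is common_part()).

    // Sketch of the KV-cache handling this patch establishes.
    // Assumes llama.h as of Oct 2023; helper names are hypothetical.
    #include "llama.h"

    #include <vector>

    // length of the shared token prefix between the cached context and the new prompt
    static size_t common_prefix(const std::vector<llama_token> & a,
                                const std::vector<llama_token> & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            i++;
        }
        return i;
    }

    // reuse the cached prefix, then evict everything past it before decoding
    static int prepare_prompt(llama_context * ctx,
                              std::vector<llama_token> & embd,                 // tokens currently cached
                              const std::vector<llama_token> & prompt_tokens)  // new prompt
    {
        int n_past = (int) common_prefix(embd, prompt_tokens);
        embd = prompt_tokens;

        if (n_past == (int) prompt_tokens.size()) {
            // we have to evaluate at least 1 token to generate logits
            n_past--;
        }

        // since #3228 the KV cache is managed manually: remove positions
        // [n_past, end) of sequence 0 so decoding can re-fill them cleanly
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

        return n_past;
    }

With p1 = -1 the call removes every position from n_past to the end of sequence 0, so the subsequent decode of the remaining prompt tokens repopulates exactly the evicted range and the stale-cell overlap of the previous ordering cannot occur.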