diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3587d9014..c5b1328d9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -434,6 +434,10 @@ struct llama_server_context { n_eval = params.n_batch; } + + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_tokens_rm(ctx, n_past, -1); + if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads)) { LOG_ERROR("failed to eval", {