From 12c2bdf2de34f747d13b270fc9d3b52490bf194f Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 22 Jan 2025 17:44:40 +0100 Subject: [PATCH] server : fix draft context not being released (#11354) --- examples/server/server.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 412908aa8..4cfb3c9bb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1772,6 +1772,9 @@ struct server_context { // force F16 KV cache for the draft model for extra performance cparams_dft.type_k = GGML_TYPE_F16; cparams_dft.type_v = GGML_TYPE_F16; + + // the context is not needed - we will create one for each slot + llama_init_dft.context.reset(); } chat_templates = common_chat_templates_from_model(model, params_base.chat_template);