From 61a88a1da399be2207c8aa0a8a280dffc3f64887 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Fri, 24 May 2024 22:41:38 -0400
Subject: [PATCH] llama : fix BERT inference without KV cache

---
 llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 6bc5167be..678c49094 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3105,6 +3105,10 @@ static bool llama_cache_init(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if (!has_kv && !has_rs) {
+                // no buffer was needed, so this is fine
+                return true;
+            }
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
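
For readers outside the llama.cpp tree, below is a minimal, self-contained sketch of the control flow this hunk patches. It is not the real code: the types, the allocation stub, and main() are placeholders, and only the has_kv/has_rs early return mirrors the actual change. The assumption (consistent with the hunk) is that ggml_backend_alloc_ctx_tensors_from_buft returns a null buffer when the context holds no cache tensors, which is the case for an encoder-only model such as BERT that needs neither a KV cache (has_kv) nor, presumably, recurrent state (has_rs).

#include <cstdio>

struct fake_ctx    {};   // stand-in for ggml_context
struct fake_buffer {};   // stand-in for ggml_backend_buffer_t

// Stand-in for ggml_backend_alloc_ctx_tensors_from_buft: returns nullptr
// when the context holds no tensors, i.e. when no cache was requested.
static fake_buffer * alloc_ctx_tensors(fake_ctx * /*ctx*/, bool ctx_has_tensors) {
    return ctx_has_tensors ? new fake_buffer() : nullptr;
}

// Mirrors the patched branch of llama_cache_init: a null buffer is only an
// error if a cache (KV or recurrent state) was actually requested.
static bool cache_init(bool has_kv, bool has_rs) {
    fake_ctx ctx;
    fake_buffer * buf = alloc_ctx_tensors(&ctx, has_kv || has_rs);
    if (!buf) {
        if (!has_kv && !has_rs) {
            // no buffer was needed, so this is fine
            return true;
        }
        std::fprintf(stderr, "failed to allocate buffer for kv cache\n");
        return false;
    }
    delete buf;
    return true;
}

int main() {
    std::printf("decoder with KV cache: %s\n", cache_init(true,  false) ? "ok" : "fail");
    std::printf("encoder, no cache:     %s\n", cache_init(false, false) ? "ok" : "fail");
    return 0;
}

Before the change, the second case (no cache at all) fell through to the error path and failed model setup even though nothing had actually gone wrong, which is what broke BERT inference without a KV cache.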