From 61a88a1da399be2207c8aa0a8a280dffc3f64887 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Fri, 24 May 2024 22:41:38 -0400
Subject: [PATCH] llama : fix BERT inference without KV cache

---
 llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 6bc5167be..678c49094 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3105,6 +3105,10 @@ static bool llama_cache_init(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if (!has_kv && !has_rs) {
+                // no buffer was needed, so this is fine
+                return true;
+            }
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
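
For readers outside the llama.cpp tree, below is a minimal, self-contained sketch of the control flow this hunk patches. It is not the real code: the types, the allocation stub, and main() are placeholders, and only the has_kv/has_rs early return mirrors the actual change. The assumption (consistent with the hunk) is that ggml_backend_alloc_ctx_tensors_from_buft returns a null buffer when the context holds no cache tensors, which is the case for an encoder-only model such as BERT that needs neither a KV cache (has_kv) nor, presumably, recurrent state (has_rs).

#include <cstdio>

struct fake_ctx    {};   // stand-in for ggml_context
struct fake_buffer {};   // stand-in for ggml_backend_buffer_t

// Stand-in for ggml_backend_alloc_ctx_tensors_from_buft: returns nullptr
// when the context holds no tensors, i.e. when no cache was requested.
static fake_buffer * alloc_ctx_tensors(fake_ctx * /*ctx*/, bool ctx_has_tensors) {
    return ctx_has_tensors ? new fake_buffer() : nullptr;
}

// Mirrors the patched branch of llama_cache_init: a null buffer is only an
// error if a cache (KV or recurrent state) was actually requested.
static bool cache_init(bool has_kv, bool has_rs) {
    fake_ctx ctx;
    fake_buffer * buf = alloc_ctx_tensors(&ctx, has_kv || has_rs);
    if (!buf) {
        if (!has_kv && !has_rs) {
            // no buffer was needed, so this is fine
            return true;
        }
        std::fprintf(stderr, "failed to allocate buffer for kv cache\n");
        return false;
    }
    delete buf;
    return true;
}

int main() {
    std::printf("decoder with KV cache: %s\n", cache_init(true,  false) ? "ok" : "fail");
    std::printf("encoder, no cache:     %s\n", cache_init(false, false) ? "ok" : "fail");
    return 0;
}

Before the change, the second case (no cache at all) fell through to the error path and failed model setup even though nothing had actually gone wrong, which is what broke BERT inference without a KV cache.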