llama : fix BERT inference without KV cache

Francis Couture-Harpin 2024-05-24 22:41:38 -04:00
parent 0fd13e9473
commit 61a88a1da3


@@ -3105,6 +3105,10 @@ static bool llama_cache_init(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if (!has_kv && !has_rs) {
+                // no buffer was needed, so this is fine
+                return true;
+            }
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
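
For context, here is a minimal standalone sketch of the control flow this change introduces (hypothetical helper names, not the actual llama.cpp API): a null cache buffer is treated as success only when neither a KV cache nor a recurrent state was requested, which is the case for encoder-only models such as BERT.

// Standalone sketch (hypothetical names, not the llama.cpp API) modeling the
// control flow of this fix: a null allocation is only an error when a cache
// was actually requested.
#include <cstdio>

struct buffer { /* stand-in for ggml_backend_buffer_t */ };

// Pretend allocator: returns nullptr when there is nothing to allocate,
// mirroring how an empty cache context yields no buffer.
static buffer * alloc_cache_buffer(bool has_kv, bool has_rs) {
    if (!has_kv && !has_rs) {
        return nullptr; // no cache tensors exist (e.g. a BERT-style encoder)
    }
    static buffer b;
    return &b;
}

static bool cache_init(bool has_kv, bool has_rs) {
    buffer * buf = alloc_cache_buffer(has_kv, has_rs);
    if (!buf) {
        if (!has_kv && !has_rs) {
            // no buffer was needed, so this is fine
            return true;
        }
        fprintf(stderr, "%s: failed to allocate buffer for kv cache\n", __func__);
        return false;
    }
    return true;
}

int main() {
    // BERT-like encoder: no KV cache, no recurrent state -> init must still succeed.
    printf("encoder init: %s\n", cache_init(false, false) ? "ok" : "failed");
    // Decoder with a KV cache -> buffer is allocated normally.
    printf("decoder init: %s\n", cache_init(true, false) ? "ok" : "failed");
}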