llama : fix BERT inference without KV cache

Francis Couture-Harpin 2024-05-24 22:41:38 -04:00
parent 0fd13e9473
commit 61a88a1da3


@@ -3105,6 +3105,10 @@ static bool llama_cache_init(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if (!has_kv && !has_rs) {
+                // no buffer was needed, so this is fine
+                return true;
+            }
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
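
For context, here is a minimal standalone sketch of the control flow this change introduces (hypothetical helper names, not the actual llama.cpp API): a null cache buffer is treated as success only when neither a KV cache nor a recurrent state was requested, which is the case for encoder-only models such as BERT.

// Standalone sketch (hypothetical names, not the llama.cpp API) modeling the
// control flow of this fix: a null allocation is only an error when a cache
// was actually requested.
#include <cstdio>

struct buffer { /* stand-in for ggml_backend_buffer_t */ };

// Pretend allocator: returns nullptr when there is nothing to allocate,
// mirroring how an empty cache context yields no buffer.
static buffer * alloc_cache_buffer(bool has_kv, bool has_rs) {
    if (!has_kv && !has_rs) {
        return nullptr; // no cache tensors exist (e.g. a BERT-style encoder)
    }
    static buffer b;
    return &b;
}

static bool cache_init(bool has_kv, bool has_rs) {
    buffer * buf = alloc_cache_buffer(has_kv, has_rs);
    if (!buf) {
        if (!has_kv && !has_rs) {
            // no buffer was needed, so this is fine
            return true;
        }
        fprintf(stderr, "%s: failed to allocate buffer for kv cache\n", __func__);
        return false;
    }
    return true;
}

int main() {
    // BERT-like encoder: no KV cache, no recurrent state -> init must still succeed.
    printf("encoder init: %s\n", cache_init(false, false) ? "ok" : "failed");
    // Decoder with a KV cache -> buffer is allocated normally.
    printf("decoder init: %s\n", cache_init(true, false) ? "ok" : "failed");
}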