From 85679d37f34f66783cc04664a06c405b28e8e035 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Fri, 1 Nov 2024 00:49:53 +0100
Subject: [PATCH] llama : improve output buffer type selection (#10098)

---
 src/llama.cpp | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index ed3998a1f..ca0d259b2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17162,18 +17162,10 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
 
         auto * buft = ggml_backend_cpu_buffer_type();
         // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-        ggml_tensor * output_tensor = lctx.model.output;
-        if (!output_tensor) {
-            // bert models don't have an output tensor, use the last layer
-            output_tensor = lctx.model.layers.back().layer_out_norm;
-        }
-        if (output_tensor) {
-            auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer);
-            auto * output_dev = ggml_backend_buft_get_device(output_buft);
-            auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev);
-            if (output_dev_host_buft) {
-                buft = output_dev_host_buft;
-            }
+        auto * output_dev = lctx.model.dev_output.dev;
+        auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
+        if (output_dev_host_buft) {
+            buft = output_dev_host_buft;
         }
         lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
         if (lctx.buf_output == nullptr) {
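
Note (editor's addition, not part of the patch): the old code located the output device indirectly, by walking from the output tensor to its buffer and from the buffer to its device, with a special case for BERT-style models that have no output tensor. The new code reads the device assigned to the output layer (lctx.model.dev_output.dev) directly, which covers both cases. The selection pattern boils down to the minimal sketch below, which assumes the public ggml-backend device API used in the diff; pick_output_buffer_type is a hypothetical helper name for illustration, not a function in llama.cpp.

    #include "ggml-backend.h"

    // Prefer the host buffer type of the device that produces the output;
    // fall back to the plain CPU buffer type when the device is unknown or
    // exposes no host buffer type.
    static ggml_backend_buffer_type_t pick_output_buffer_type(ggml_backend_dev_t output_dev) {
        // default: regular (pageable) system memory from the CPU backend
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        // a device's host buffer type allocates pinned system memory, which
        // speeds up device-to-host copies of the computed logits/embeddings
        ggml_backend_buffer_type_t host_buft =
            output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
        if (host_buft) {
            buft = host_buft;
        }
        return buft;
    }

    // usage sketch: allocate the output buffer from the chosen buffer type,
    // mirroring the ggml_backend_buft_alloc_buffer(buft, new_size) call in the hunk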