mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 06:39:25 +01:00
llama : improve output buffer type selection (#10098)
This commit is contained in:
parent
1e9f94994e
commit
85679d37f3
@ -17162,19 +17162,11 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
||||
|
||||
auto * buft = ggml_backend_cpu_buffer_type();
|
||||
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
|
||||
ggml_tensor * output_tensor = lctx.model.output;
|
||||
if (!output_tensor) {
|
||||
// bert models don't have an output tensor, use the last layer
|
||||
output_tensor = lctx.model.layers.back().layer_out_norm;
|
||||
}
|
||||
if (output_tensor) {
|
||||
auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer);
|
||||
auto * output_dev = ggml_backend_buft_get_device(output_buft);
|
||||
auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev);
|
||||
auto * output_dev = lctx.model.dev_output.dev;
|
||||
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
|
||||
if (output_dev_host_buft) {
|
||||
buft = output_dev_host_buft;
|
||||
}
|
||||
}
|
||||
lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
|
||||
if (lctx.buf_output == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
||||
|
Loading…
Reference in New Issue
Block a user