mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-16 15:18:26 +01:00
kompute : initial attempt at ggml-backend v2 support
This commit is contained in:
parent
1eb8804c18
commit
d5670d6e46
181
ggml-kompute.cpp
181
ggml-kompute.cpp
@ -1,5 +1,7 @@
|
|||||||
#include "ggml-kompute.h"
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml-backend-impl.h"
|
||||||
|
#include "ggml-kompute.h"
|
||||||
|
|
||||||
// These are generated at build time by cmake custom command
|
// These are generated at build time by cmake custom command
|
||||||
#include "shaderop_scale.h"
|
#include "shaderop_scale.h"
|
||||||
@ -488,16 +490,28 @@ void ggml_vk_free_memory(ggml_vk_memory &memory)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
|
ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
|
||||||
|
// compatibility with ggml-backend
|
||||||
|
if (t->buffer && t->buffer->buft == ggml_backend_kompute_buffer_type()) {
|
||||||
|
ggml_vk_memory * buf_ctx = (ggml_vk_memory *) t->buffer->context;
|
||||||
|
|
||||||
|
const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
|
||||||
|
|
||||||
|
GGML_ASSERT(ioffs >= 0 && ioffs + ggml_nbytes(t) <= (int64_t)t->buffer->size);
|
||||||
|
|
||||||
|
offset = (uint64_t)ioffs;
|
||||||
|
return buf_ctx;
|
||||||
|
}
|
||||||
|
|
||||||
for (auto it = ctx->buffers.begin(); ; it++) {
|
for (auto it = ctx->buffers.begin(); ; it++) {
|
||||||
if (it == ctx->buffers.end()) {
|
if (it == ctx->buffers.end()) {
|
||||||
fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data);
|
fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data);
|
||||||
return it;
|
return nullptr;
|
||||||
}
|
}
|
||||||
if (it->data <= t->data &&
|
if (it->data <= t->data &&
|
||||||
reinterpret_cast<intptr_t>(it->data) + it->size >= (reinterpret_cast<intptr_t>(t->data) + ggml_nbytes(t))) {
|
reinterpret_cast<intptr_t>(it->data) + it->size >= (reinterpret_cast<intptr_t>(t->data) + ggml_nbytes(t))) {
|
||||||
offset = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(it->data);
|
offset = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(it->data);
|
||||||
return it;
|
return &*it;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -505,8 +519,8 @@ decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggm
|
|||||||
static
|
static
|
||||||
const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) {
|
const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) {
|
||||||
uint64_t originalOffset = 0;
|
uint64_t originalOffset = 0;
|
||||||
auto res = ggml_vk_find_tensor(ctx, t, originalOffset);
|
auto * res = ggml_vk_find_tensor(ctx, t, originalOffset);
|
||||||
if (res == ctx->buffers.end()) {
|
if (!res) {
|
||||||
static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
|
static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
|
||||||
return nullTensor;
|
return nullTensor;
|
||||||
}
|
}
|
||||||
@ -1629,3 +1643,158 @@ kp::TensorT<uint8_t>::dataType()
|
|||||||
{
|
{
|
||||||
return TensorDataTypes::eUnsignedInt;
|
return TensorDataTypes::eUnsignedInt;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// backend interface
|
||||||
|
|
||||||
|
// Buffer-interface callback: every Kompute buffer reports the same fixed name,
// so the buffer handle is not consulted.
static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t /* buffer */) {
    return "Kompute";
}
|
||||||
|
|
||||||
|
// Buffer-interface callback: release the ggml_vk_memory stored in the buffer's
// context. The Vulkan-side memory is only freed while a device is still
// present; the heap-allocated descriptor itself is always deleted.
static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_vk_memory * mem = (ggml_vk_memory *) buffer->context;

    if (ggml_vk_has_device()) {
        ggml_vk_free_memory(*mem);
    }

    delete mem;
}
|
||||||
|
|
||||||
|
// Buffer-interface callback: the base address of a buffer is the `data`
// pointer of the ggml_vk_memory held in its context.
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
    const auto * mem = (ggml_vk_memory *) buffer->context;
    return mem->data;
}
|
||||||
|
|
||||||
|
// Buffer-interface callback: copy `size` bytes from `data` into the tensor's
// storage at byte `offset`, then push the buffer to the device.
//
// NOTE(review): ggml_vk_h2d_buffer() receives the buffer's whole memory object
// (no offset/size is passed), so this presumably syncs more than the written
// range - confirm against its implementation.
static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    char * dst = (char *) tensor->data + offset;
    memcpy(dst, data, size);

    ggml_vk_memory & mem = *(ggml_vk_memory *) buffer->context;
    ggml_vk_h2d_buffer(mem);
}
|
||||||
|
|
||||||
|
// Buffer-interface callback: pull the buffer back from the device first, then
// copy `size` bytes starting at byte `offset` of the tensor's storage into
// `data`.
static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_vk_memory & mem = *(ggml_vk_memory *) buffer->context;
    ggml_vk_d2h_buffer(mem);

    const char * src = (const char *) tensor->data + offset;
    memcpy(data, src, size);
}
|
||||||
|
|
||||||
|
// Buffer-interface callback: fill all `buffer->size` bytes of the buffer's
// memory with `value`, then push the result to the device.
static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_vk_memory & mem = *(ggml_vk_memory *) buffer->context;

    memset(mem.data, value, buffer->size);
    ggml_vk_h2d_buffer(mem);
}
|
||||||
|
|
||||||
|
// Callback table for Kompute buffers. Entries are positional, so their order
// must follow the field order named in the comments. Slots left null are not
// implemented by this backend.
static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
    /* .get_name        = */ ggml_backend_kompute_buffer_get_name,
    /* .free_buffer     = */ ggml_backend_kompute_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_kompute_buffer_get_base,
    /* .init_tensor     = */ nullptr,
    /* .set_tensor      = */ ggml_backend_kompute_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_kompute_buffer_get_tensor,
    /* .cpy_tensor_from = */ nullptr,
    /* .cpy_tensor_to   = */ nullptr,
    /* .clear           = */ ggml_backend_kompute_buffer_clear,
    /* .reset           = */ nullptr,
};
|
||||||
|
|
||||||
|
// default buffer type
|
||||||
|
|
||||||
|
// Buffer-type callback: fixed name of the Kompute buffer type; the handle is
// not consulted.
static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t /* buft */) {
    return "Kompute";
}
|
||||||
|
|
||||||
|
// Buffer-type callback: allocate `size` bytes through ggml_vk_allocate(), keep
// the resulting descriptor on the heap, and wrap it in a backend buffer that
// uses the Kompute buffer callbacks. The descriptor becomes the buffer's
// context and is deleted again by the free_buffer callback.
static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_vk_memory * mem = new ggml_vk_memory(ggml_vk_allocate(size));
    return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, mem, size);
}
|
||||||
|
|
||||||
|
// Buffer-type callback: alignment reported for this buffer type, a constant
// 32 regardless of the handle passed in.
static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t /* buft */) {
    return 32;
}
|
||||||
|
|
||||||
|
// Buffer-type callback: this buffer type is usable only by the Kompute
// backend, as decided by ggml_backend_is_kompute().
static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t /* buft */, ggml_backend_t backend) {
    return ggml_backend_is_kompute(backend);
}
|
||||||
|
|
||||||
|
// Returns the Kompute buffer type. The descriptor is a function-local static,
// so every call yields the same pointer. Callback slots left null are not
// provided by this backend; the descriptor carries no context.
ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void) {
    static struct ggml_backend_buffer_type kompute_buft = {
        /* .iface   = */ {
            /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
            /* .get_alloc_size   = */ nullptr, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
            /* .is_host          = */ nullptr,
        },
        /* .context = */ nullptr,
    };

    return &kompute_buft;
}
|
||||||
|
|
||||||
|
// backend
|
||||||
|
|
||||||
|
// Backend callback: fixed backend name. The address of this function is also
// what ggml_backend_is_kompute() compares against to recognise the backend.
static const char * ggml_backend_kompute_name(ggml_backend_t /* backend */) {
    return "Kompute";
}
|
||||||
|
|
||||||
|
// Backend callback: tear down the backend - release the device, free the
// Kompute context stored in the backend, then delete the backend object.
//
// NOTE(review): the device is released before the context here; confirm that
// ggml_vk_free() does not need a live device.
static void ggml_backend_kompute_free(ggml_backend_t backend) {
    auto * kctx = (struct ggml_kompute_context *) backend->context;

    ggml_vk_free_device();
    ggml_vk_free(kctx);

    delete backend;
}
|
||||||
|
|
||||||
|
// Backend callback: the default buffer type is the single Kompute buffer type,
// independent of which backend instance asks.
static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t /* backend */) {
    return ggml_backend_kompute_buffer_type();
}
|
||||||
|
|
||||||
|
// Backend callback: run `cgraph` through ggml_vk_graph_compute() using the
// Kompute context stored in the backend. Always reports success, since the
// compute call's result is not inspected here.
static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_vk_graph_compute((ggml_kompute_context *) backend->context, cgraph);
    return true;
}
|
||||||
|
|
||||||
|
// Backend callback: placeholder that claims support for every op without
// looking at either argument.
static bool ggml_backend_kompute_supports_op(ggml_backend_t /* backend */, const struct ggml_tensor * /* op */) {
    return true; // TODO: implement
}
|
||||||
|
|
||||||
|
static struct ggml_backend_i kompute_backend_i = {
|
||||||
|
/* .get_name = */ ggml_backend_kompute_name,
|
||||||
|
/* .free = */ ggml_backend_kompute_free,
|
||||||
|
/* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
|
||||||
|
/* .set_tensor_async = */ NULL,
|
||||||
|
/* .get_tensor_async = */ NULL,
|
||||||
|
/* .cpy_tensor_from_async = */ NULL,
|
||||||
|
/* .cpy_tensor_to_async = */ NULL,
|
||||||
|
/* .synchronize = */ NULL,
|
||||||
|
/* .graph_plan_create = */ NULL,
|
||||||
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_compute = */ NULL,
|
||||||
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
||||||
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create a Kompute backend instance.
//
// Requires that a device has already been set up: when ggml_vk_has_device()
// is false an error is printed to stderr and nullptr is returned. Otherwise a
// fresh context from ggml_vk_init() is wrapped in a heap-allocated
// ggml_backend that uses the Kompute callback table; the backend's free
// callback deletes it again.
ggml_backend_t ggml_backend_kompute_init() {
    if (ggml_vk_has_device()) {
        // create the context first, then the backend object that carries it
        struct ggml_kompute_context * kctx = ggml_vk_init();

        return new ggml_backend {
            /* .interface = */ kompute_backend_i,
            /* .context   = */ kctx,
        };
    }

    fprintf(stderr, "%s: error: device was not initialized\n", __func__);
    return nullptr;
}
|
||||||
|
|
||||||
|
// True when `backend` is non-null and its get_name callback is the Kompute
// one; a null handle yields false.
bool ggml_backend_is_kompute(ggml_backend_t backend) {
    if (!backend) {
        return false;
    }
    return backend->iface.get_name == ggml_backend_kompute_name;
}
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -55,3 +57,17 @@ void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
|
|||||||
void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
|
void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
|
||||||
void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
|
void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
|
||||||
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
|
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
|
||||||
|
|
||||||
|
//
|
||||||
|
// backend API
|
||||||
|
// user-code should use only these functions
|
||||||
|
//
|
||||||
|
|
||||||
|
// forward declaration
|
||||||
|
typedef struct ggml_backend * ggml_backend_t;
|
||||||
|
|
||||||
|
GGML_API ggml_backend_t ggml_backend_kompute_init(void);
|
||||||
|
|
||||||
|
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
|
||||||
|
148
llama.cpp
148
llama.cpp
@ -760,63 +760,6 @@ static std::string llama_format_win_err(DWORD err) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// TODO(jared): remove this
|
|
||||||
struct llama_buffer {
|
|
||||||
void * data = NULL;
|
|
||||||
size_t size = 0;
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
ggml_vk_memory memory;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// fallback to malloc / free
|
|
||||||
// useful in cases where CUDA can try to allocate PINNED memory
|
|
||||||
bool fallback = false;
|
|
||||||
|
|
||||||
void resize(size_t n) {
|
|
||||||
llama_host_free(data);
|
|
||||||
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
if (ggml_vk_has_device()) {
|
|
||||||
this->memory = ggml_vk_allocate(n);
|
|
||||||
this->data = (uint8_t*)memory.data;
|
|
||||||
this->size = n;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
data = llama_host_malloc(n);
|
|
||||||
if (!data) {
|
|
||||||
fallback = true;
|
|
||||||
data = malloc(n);
|
|
||||||
} else {
|
|
||||||
fallback = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_ASSERT(data);
|
|
||||||
size = n;
|
|
||||||
}
|
|
||||||
|
|
||||||
~llama_buffer() {
|
|
||||||
if (data) {
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
if (memory.data) {
|
|
||||||
if (ggml_vk_has_device()) {
|
|
||||||
ggml_vk_free_memory(memory);
|
|
||||||
}
|
|
||||||
data = NULL;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (fallback) { // NOLINT
|
|
||||||
free(data);
|
|
||||||
} else {
|
|
||||||
llama_host_free(data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
data = NULL;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct no_init {
|
struct no_init {
|
||||||
T value;
|
T value;
|
||||||
@ -1288,6 +1231,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|||||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
buft = ggml_backend_opencl_buffer_type();
|
buft = ggml_backend_opencl_buffer_type();
|
||||||
|
#elif defined(GGML_USE_KOMPUTE)
|
||||||
|
buft = ggml_backend_kompute_buffer_type();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (buft == nullptr) {
|
if (buft == nullptr) {
|
||||||
@ -1721,11 +1666,6 @@ struct llama_context {
|
|||||||
// allocator for the input tensors
|
// allocator for the input tensors
|
||||||
ggml_tallocr * alloc = nullptr;
|
ggml_tallocr * alloc = nullptr;
|
||||||
|
|
||||||
// TODO(jared): remove this
|
|
||||||
#if defined(GGML_USE_KOMPUTE)
|
|
||||||
ggml_kompute_context * ctx_kompute = NULL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// temporary buffer for copying data to/from the backend
|
// temporary buffer for copying data to/from the backend
|
||||||
std::vector<no_init<uint8_t>> buf_copy;
|
std::vector<no_init<uint8_t>> buf_copy;
|
||||||
|
|
||||||
@ -4362,10 +4302,6 @@ struct llm_build_context {
|
|||||||
|
|
||||||
std::vector<uint8_t> & buf_compute_meta;
|
std::vector<uint8_t> & buf_compute_meta;
|
||||||
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
ggml_kompute_context * ctx_kompute;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct ggml_context * ctx0 = nullptr;
|
struct ggml_context * ctx0 = nullptr;
|
||||||
|
|
||||||
// TODO: consider making the entire interface noexcept
|
// TODO: consider making the entire interface noexcept
|
||||||
@ -4405,10 +4341,6 @@ struct llm_build_context {
|
|||||||
do_rope_shift (worst_case || kv_self.has_shift),
|
do_rope_shift (worst_case || kv_self.has_shift),
|
||||||
cb (cb),
|
cb (cb),
|
||||||
buf_compute_meta (lctx.buf_compute_meta)
|
buf_compute_meta (lctx.buf_compute_meta)
|
||||||
// TODO(jared): remove this
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
, ctx_kompute (lctx.ctx_kompute)
|
|
||||||
#endif
|
|
||||||
{
|
{
|
||||||
// all initializations should be done in init()
|
// all initializations should be done in init()
|
||||||
}
|
}
|
||||||
@ -6028,11 +5960,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
bool alloc_inp_KQ_mask = false;
|
bool alloc_inp_KQ_mask = false;
|
||||||
bool alloc_inp_K_shift = false;
|
bool alloc_inp_K_shift = false;
|
||||||
|
|
||||||
// TODO(jared): do we still need this?
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
||||||
// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
|
// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
|
||||||
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
||||||
@ -6149,22 +6076,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
|
|
||||||
alloc_inp_K_shift = true;
|
alloc_inp_K_shift = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(jared): this shouldn't be needed anymore
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
if (lctx.ctx_kompute && !needs_h2d_all) {
|
|
||||||
const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"};
|
|
||||||
for (auto off : offload_tensors) {
|
|
||||||
if (strcmp(name, off) == 0) {
|
|
||||||
ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (strcmp(name, "inp_embd") == 0 && !batch.token) {
|
|
||||||
ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_cgraph * result = NULL;
|
struct ggml_cgraph * result = NULL;
|
||||||
@ -6230,12 +6141,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
if (needs_h2d_all) {
|
|
||||||
ggml_vk_h2d_all(lctx.ctx_kompute);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
llm.free();
|
llm.free();
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
@ -6374,25 +6279,6 @@ static int llama_decode_internal(
|
|||||||
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
||||||
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_KOMPUTE)
|
|
||||||
if (lctx.ctx_kompute && n_tokens == 1) {
|
|
||||||
ggml_vk_graph_compute(lctx.ctx_kompute, gf);
|
|
||||||
ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
|
|
||||||
} else {
|
|
||||||
if (lctx.ctx_kompute) {
|
|
||||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
|
||||||
ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
|
|
||||||
ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
|
||||||
if (lctx.ctx_kompute) {
|
|
||||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
|
||||||
ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
|
|
||||||
ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (lctx.backend_cpu != nullptr) {
|
if (lctx.backend_cpu != nullptr) {
|
||||||
@ -9446,6 +9332,16 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#elif defined(GGML_USE_KOMPUTE)
|
||||||
|
if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
|
||||||
|
auto * backend = ggml_backend_kompute_init();
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
@ -9518,23 +9414,6 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(jared): remove this
|
|
||||||
#if defined(GGML_USE_KOMPUTE)
|
|
||||||
if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
|
|
||||||
// this allocates all Vulkan resources and memory buffers
|
|
||||||
ctx->ctx_kompute = ggml_vk_init();
|
|
||||||
|
|
||||||
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
|
||||||
|
|
||||||
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
|
||||||
|
|
||||||
ggml_vk_add_buffer(ctx->ctx_kompute, "data", ctx->model.buf.memory);
|
|
||||||
ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.memory);
|
|
||||||
ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->kv_self.buf.memory);
|
|
||||||
ggml_vk_add_buffer(ctx->ctx_kompute, "alloc", ctx->buf_alloc.memory);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
#ifdef GGML_USE_MPI
|
||||||
@ -9555,9 +9434,6 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void llama_free(struct llama_context * ctx) {
|
void llama_free(struct llama_context * ctx) {
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
ggml_vk_free(ctx->ctx_kompute);
|
|
||||||
#endif
|
|
||||||
delete ctx;
|
delete ctx;
|
||||||
#ifdef GGML_USE_KOMPUTE
|
#ifdef GGML_USE_KOMPUTE
|
||||||
ggml_vk_free_device();
|
ggml_vk_free_device();
|
||||||
|
Loading…
Reference in New Issue
Block a user