mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-22 17:49:10 +01:00
386 lines
14 KiB
C
386 lines
14 KiB
C
|
#include "ggml-backend.h"
|
||
|
#include "ggml-alloc.h"
|
||
|
|
||
|
#include <assert.h>
|
||
|
#include <stdarg.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
// short local alias for the project's "mark as intentionally unused" macro
#define UNUSED GGML_UNUSED

// larger of two values; note that each argument may be evaluated twice
#define MAX(a, b) ((b) < (a) ? (a) : (b))
|
||
|
|
||
|
// backend buffer
|
||
|
|
||
|
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||
|
struct ggml_backend * backend,
|
||
|
struct ggml_backend_buffer_i iface,
|
||
|
ggml_backend_buffer_context_t context,
|
||
|
size_t size) {
|
||
|
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
||
|
|
||
|
GGML_ASSERT(iface.get_base != NULL);
|
||
|
|
||
|
(*buffer) = (struct ggml_backend_buffer) {
|
||
|
/* .interface = */ iface,
|
||
|
/* .backend = */ backend,
|
||
|
/* .context = */ context,
|
||
|
/* .size = */ size,
|
||
|
};
|
||
|
|
||
|
return buffer;
|
||
|
}
|
||
|
|
||
|
// Releases a buffer handle: first lets the buffer interface free whatever it
// owns (if it provides free_buffer), then frees the handle itself.
// Passing NULL is a no-op, mirroring free(NULL).
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer == NULL) {
        return;
    }

    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    free(buffer);
}
|
||
|
|
||
|
// Returns the alignment reported by the backend stored in `buffer`.
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    ggml_backend_t owner = buffer->backend;
    return ggml_backend_get_alignment(owner);
}
|
||
|
|
||
|
// Returns the base address of the buffer, as reported by its get_base callback.
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * base = buffer->iface.get_base(buffer);
    return base;
}
|
||
|
|
||
|
// Returns the size that was recorded in the buffer handle when it was created.
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    const size_t recorded_size = buffer->size;
    return recorded_size;
}
|
||
|
|
||
|
// Returns the number of bytes to reserve for `tensor` in `buffer`.
// Uses the buffer's get_alloc_size callback when present; otherwise falls
// back to ggml_nbytes(tensor).
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (!buffer->iface.get_alloc_size) {
        // no backend-specific sizing: the plain tensor size is used
        return ggml_nbytes(tensor);
    }
    return buffer->iface.get_alloc_size(buffer, tensor);
}
|
||
|
|
||
|
// Invokes the buffer's optional init_tensor callback for `tensor`.
// Does nothing when the buffer interface does not provide one.
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (!buffer->iface.init_tensor) {
        return;
    }
    buffer->iface.init_tensor(buffer, tensor);
}
|
||
|
|
||
|
// Invokes the buffer's optional free_tensor callback for `tensor`.
// Does nothing when the buffer interface does not provide one.
void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (!buffer->iface.free_tensor) {
        return;
    }
    buffer->iface.free_tensor(buffer, tensor);
}
|
||
|
|
||
|
// backend
|
||
|
|
||
|
ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
|
||
|
return tensor->buffer->backend;
|
||
|
}
|
||
|
|
||
|
// Returns the name string reported by the backend's get_name callback.
const char * ggml_backend_name(ggml_backend_t backend) {
    const char * name = backend->iface.get_name(backend);
    return name;
}
|
||
|
|
||
|
// Destroys a backend by delegating to its own free callback.
void ggml_backend_free(ggml_backend_t backend) {
    (backend->iface.free)(backend);
}
|
||
|
|
||
|
// Asks `backend` to allocate a buffer of `size` bytes through its
// alloc_buffer callback and returns the resulting buffer handle.
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    ggml_backend_buffer_t allocated = backend->iface.alloc_buffer(backend, size);
    return allocated;
}
|
||
|
|
||
|
// Returns the alignment reported by the backend's get_alignment callback.
size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    const size_t alignment = backend->iface.get_alignment(backend);
    return alignment;
}
|
||
|
|
||
|
void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||
|
ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||
|
}
|
||
|
|
||
|
void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||
|
ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||
|
}
|
||
|
|
||
|
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||
|
ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||
|
ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
|
||
|
}
|
||
|
|
||
|
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||
|
ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||
|
ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
|
||
|
}
|
||
|
|
||
|
// Forwards to the backend's synchronize callback.
void ggml_backend_synchronize(ggml_backend_t backend) {
    (backend->iface.synchronize)(backend);
}
|
||
|
|
||
|
// Asks the backend to build an execution plan for `cgraph` and returns the
// plan handle produced by its graph_plan_create callback.
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_graph_plan_t plan = backend->iface.graph_plan_create(backend, cgraph);
    return plan;
}
|
||
|
|
||
|
// Releases `plan` through the backend's graph_plan_free callback.
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    (backend->iface.graph_plan_free)(backend, plan);
}
|
||
|
|
||
|
// Executes `plan` through the backend's graph_plan_compute callback.
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    (backend->iface.graph_plan_compute)(backend, plan);
}
|
||
|
|
||
|
// Computes `cgraph` directly through the backend's graph_compute callback.
void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    (backend->iface.graph_compute)(backend, cgraph);
}
|
||
|
|
||
|
// Returns the answer of the backend's supports_op callback for `op`.
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    const bool supported = backend->iface.supports_op(backend, op);
    return supported;
}
|
||
|
|
||
|
// backend copy
|
||
|
|
||
|
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||
|
if (a->type != b->type) {
|
||
|
return false;
|
||
|
}
|
||
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||
|
if (a->ne[i] != b->ne[i]) {
|
||
|
return false;
|
||
|
}
|
||
|
if (a->nb[i] != b->nb[i]) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||
|
//printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
|
||
|
//printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
|
||
|
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
||
|
|
||
|
// printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
|
||
|
|
||
|
if (src == dst) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// TODO: allow backends to support copy to/from same backend
|
||
|
|
||
|
if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
|
||
|
ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
|
||
|
} else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
|
||
|
ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
|
||
|
} else {
|
||
|
// shouldn't be hit when copying from/to CPU
|
||
|
#ifndef NDEBUG
|
||
|
fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
|
||
|
#endif
|
||
|
size_t nbytes = ggml_nbytes(src);
|
||
|
void * data = malloc(nbytes);
|
||
|
ggml_backend_tensor_get(src, data, 0, nbytes);
|
||
|
ggml_backend_tensor_set(dst, data, 0, nbytes);
|
||
|
free(data);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// backend CPU
|
||
|
|
||
|
// Private state of a CPU backend instance.
struct ggml_backend_cpu_context {
    int    n_threads; // thread count handed to ggml_graph_plan
    void * work_data; // scratch buffer reused by graph_compute; starts out NULL
    size_t work_size; // current capacity of work_data, in bytes
};
|
||
|
|
||
|
// get_name callback of the CPU backend: always reports "CPU".
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
    UNUSED(backend);

    return "CPU";
}
|
||
|
|
||
|
// free callback of the CPU backend: releases the work buffer, the CPU
// context and finally the backend object itself.
static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend->context;

    // inner allocation first, then the structures that reference it
    free(ctx->work_data);
    free(ctx);
    free(backend);
}
|
||
|
|
||
|
// get_base callback for CPU buffers: the buffer context is itself the data pointer.
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * base = (void *)buffer->context;
    return base;
}
|
||
|
|
||
|
// free_buffer callback for CPU buffers: frees the pointer stored in the
// buffer context. The buffer handle itself is not freed here.
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * owned_data = buffer->context;
    free(owned_data);
    UNUSED(buffer);
}
|
||
|
|
||
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
||
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||
|
/* .init_tensor = */ NULL, // no initialization required
|
||
|
/* .free_tensor = */ NULL, // no cleanup required
|
||
|
};
|
||
|
|
||
|
// for buffers from ptr, free is not called
|
||
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||
|
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
||
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||
|
/* .init_tensor = */ NULL,
|
||
|
/* .free_tensor = */ NULL,
|
||
|
};
|
||
|
|
||
|
// alignment (in bytes) reported by the CPU backend; 64 should be enough for AVX 512
static const size_t TENSOR_ALIGNMENT = 64;
|
||
|
|
||
|
// alloc_buffer callback of the CPU backend.
//
// Allocates `size` + TENSOR_ALIGNMENT bytes with malloc (the extra bytes are
// slack because malloc may return an address that is not aligned) and wraps
// the memory in a buffer handle that frees it through cpu_backend_buffer_i.
// The padded size is what gets recorded in the handle.
//
// Aborts (GGML_ASSERT) if the padded size would overflow size_t or if the
// allocation fails.
static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
    // guard the addition below against wrapping around to a tiny allocation
    GGML_ASSERT(size <= (size_t)-1 - TENSOR_ALIGNMENT && "buffer size overflow");

    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
    GGML_ASSERT(data != NULL && "failed to allocate buffer");

    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
}
|
||
|
|
||
|
// get_alignment callback of the CPU backend: always TENSOR_ALIGNMENT.
static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
    UNUSED(backend);

    return TENSOR_ALIGNMENT;
}
|
||
|
|
||
|
// set_tensor_async callback of the CPU backend: a plain memcpy of `size`
// bytes from `data` into the tensor's memory at byte `offset`.
// Asserts that the range lies inside the tensor and that the tensor has data.
static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    UNUSED(backend);

    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    char * dst_bytes = (char *)tensor->data;
    memcpy(dst_bytes + offset, data, size);
}
|
||
|
|
||
|
// get_tensor_async callback of the CPU backend: a plain memcpy of `size`
// bytes from the tensor's memory at byte `offset` into `data`.
// Asserts that the range lies inside the tensor and that the tensor has data.
static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    UNUSED(backend);

    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    const char * src_bytes = (const char *)tensor->data;
    memcpy(data, src_bytes + offset, size);
}
|
||
|
|
||
|
// synchronize callback of the CPU backend: intentionally empty, since the
// CPU set/get callbacks in this file complete their memcpy before returning.
static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
    UNUSED(backend);
}
|
||
|
|
||
|
// cpy_tensor_from callback of the CPU backend: reads all of `src` through
// the blocking ggml_backend_tensor_get directly into dst->data.
static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    UNUSED(backend);

    const size_t nbytes = ggml_nbytes(src);
    ggml_backend_tensor_get(src, dst->data, 0, nbytes);
}
|
||
|
|
||
|
// cpy_tensor_to callback of the CPU backend: writes all of src->data into
// `dst` through ggml_backend_tensor_set_async.
static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    UNUSED(backend);

    const size_t nbytes = ggml_nbytes(src);

    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
    ggml_backend_tensor_set_async(dst, src->data, 0, nbytes);
}
|
||
|
|
||
|
struct ggml_backend_plan_cpu {
|
||
|
struct ggml_cplan cplan;
|
||
|
struct ggml_cgraph cgraph;
|
||
|
};
|
||
|
|
||
|
// graph_plan_create callback of the CPU backend.
//
// Builds a ggml_cplan for `cgraph` with the context's thread count, stores a
// by-value copy of the graph struct next to it, and allocates the plan's work
// buffer when one is required. The plan is released by
// ggml_backend_cpu_graph_plan_free.
//
// Aborts (GGML_ASSERT) if the plan or its work buffer cannot be allocated.
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
    GGML_ASSERT(cpu_plan != NULL && "failed to allocate CPU graph plan");

    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph;

    // NOTE(review): when work_size is 0, work_data keeps whatever
    // ggml_graph_plan returned (presumably NULL); confirm, because
    // ggml_backend_cpu_graph_plan_free passes it to free()
    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
        GGML_ASSERT(cpu_plan->cplan.work_data != NULL && "failed to allocate CPU graph plan work buffer");
    }

    return cpu_plan;
}
|
||
|
|
||
|
// graph_plan_free callback of the CPU backend: frees the plan's work buffer
// and then the plan object itself.
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    UNUSED(backend);

    struct ggml_backend_plan_cpu * owned_plan = (struct ggml_backend_plan_cpu *)plan;

    free(owned_plan->cplan.work_data);
    free(owned_plan);
}
|
||
|
|
||
|
// graph_plan_compute callback of the CPU backend: runs ggml_graph_compute on
// the graph copy and cplan stored inside `plan`.
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    UNUSED(backend);

    struct ggml_backend_plan_cpu * stored = (struct ggml_backend_plan_cpu *)plan;
    struct ggml_cgraph * graph = &stored->cgraph;
    struct ggml_cplan  * cplan = &stored->cplan;

    ggml_graph_compute(graph, cplan);
}
|
||
|
|
||
|
// graph_compute callback of the CPU backend.
//
// Plans `cgraph` with the context's thread count, grows the context's
// reusable work buffer when the plan needs more than is currently held, and
// runs the graph with that buffer. The buffer is kept for later calls and is
// released in ggml_backend_cpu_free.
//
// Aborts (GGML_ASSERT) if the work buffer cannot be grown.
static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

    if (cpu_ctx->work_size < cplan.work_size) {
        // TODO: may be faster to free and use malloc to avoid the copy
        // keep the old pointer until realloc has succeeded: on failure the
        // original block is still valid and must not be overwritten with NULL
        void * grown = realloc(cpu_ctx->work_data, cplan.work_size);
        GGML_ASSERT(grown != NULL && "failed to allocate CPU work buffer");

        cpu_ctx->work_data = grown;
        cpu_ctx->work_size = cplan.work_size;
    }

    cplan.work_data = cpu_ctx->work_data;

    ggml_graph_compute(cgraph, &cplan);
}
|
||
|
|
||
|
// supports_op callback of the CPU backend: unconditionally answers true.
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    UNUSED(backend);
    UNUSED(op);

    return true;
}
|
||
|
|
||
|
static struct ggml_backend_i cpu_backend_i = {
|
||
|
/* .get_name = */ ggml_backend_cpu_name,
|
||
|
/* .free = */ ggml_backend_cpu_free,
|
||
|
/* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer,
|
||
|
/* .get_alignment = */ ggml_backend_cpu_get_alignment,
|
||
|
/* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async,
|
||
|
/* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async,
|
||
|
/* .synchronize = */ ggml_backend_cpu_synchronize,
|
||
|
/* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from,
|
||
|
/* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to,
|
||
|
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||
|
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
||
|
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||
|
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||
|
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
||
|
};
|
||
|
|
||
|
ggml_backend_t ggml_backend_cpu_init(void) {
|
||
|
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
||
|
|
||
|
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||
|
ctx->work_data = NULL;
|
||
|
ctx->work_size = 0;
|
||
|
|
||
|
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
||
|
|
||
|
*cpu_backend = (struct ggml_backend) {
|
||
|
/* .interface = */ cpu_backend_i,
|
||
|
/* .context = */ ctx
|
||
|
};
|
||
|
return cpu_backend;
|
||
|
}
|
||
|
|
||
|
// Returns true when `backend` is a CPU backend, detected by comparing its
// get_name callback with the CPU implementation's function pointer.
bool ggml_backend_is_cpu(ggml_backend_t backend) {
    const bool has_cpu_name_fn = (backend->iface.get_name == ggml_backend_cpu_name);
    return has_cpu_name_fn;
}
|
||
|
|
||
|
// Sets the thread count stored in a CPU backend's context; the value is
// read the next time a graph is planned. Asserts that `backend_cpu` really
// is a CPU backend before touching its context.
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    cpu_ctx->n_threads = n_threads;
}
|
||
|
|
||
|
// Wraps caller-provided memory in a buffer handle that uses the
// cpu_backend_buffer_i_from_ptr interface.
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
    ggml_backend_buffer_t wrapped = ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
    return wrapped;
}
|