mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-05 18:44:51 +01:00
automatically calculate compute buffer sizes (without graph allocator)
This commit is contained in:
parent
77ac8deaf1
commit
cb205c0d13
@ -15,16 +15,20 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
|
|||||||
return offset + align;
|
return offset + align;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline size_t ggml_backend_buffer_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { return alloc->interface.get_alloc_size(alloc, tensor); }
|
static inline size_t ggml_backend_buffer_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
|
||||||
static inline void ggml_backend_buffer_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.init_tensor(alloc, tensor); }
|
return alloc->interface.get_alloc_size(alloc, tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ggml_backend_buffer_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
|
||||||
|
alloc->interface.init_tensor(alloc, tensor);
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc) {
|
void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc) {
|
||||||
alloc->interface.free_buffer(alloc);
|
alloc->interface.free_buffer(alloc);
|
||||||
free(alloc);
|
free(alloc);
|
||||||
}
|
}
|
||||||
|
|
||||||
// backend buffer allocator - simple
|
// backend buffer allocator - simple - cannot free tensors, good for weights and small contexts
|
||||||
|
|
||||||
struct ggml_allocator_simple_context {
|
struct ggml_allocator_simple_context {
|
||||||
void * data;
|
void * data;
|
||||||
@ -38,21 +42,32 @@ static void ggml_allocator_simple_free_buffer(struct ggml_backend_buffer * alloc
|
|||||||
free(context);
|
free(context);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
|
static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
|
||||||
struct ggml_allocator_simple_context * context = (struct ggml_allocator_simple_context *)alloc->context;
|
struct ggml_allocator_simple_context * context = (struct ggml_allocator_simple_context *)alloc->context;
|
||||||
|
|
||||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
||||||
if (context->offset + size > context->size) {
|
|
||||||
|
if (!alloc->measure && context->offset + size > context->size) {
|
||||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, available %zu)\n",
|
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, available %zu)\n",
|
||||||
__func__, size, context->size - context->offset);
|
__func__, size, context->size - context->offset);
|
||||||
GGML_ASSERT(!"not enough space in the buffer");
|
GGML_ASSERT(!"not enough space in the buffer");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
void * ptr = (char*)context->data + context->offset;
|
|
||||||
context->offset = aligned_offset(context->data, context->offset + size, context->alignment);
|
alloc->max_size = MAX(alloc->max_size, context->offset + size);
|
||||||
tensor->data = ptr;
|
|
||||||
if (alloc->interface.init_tensor) {
|
if (alloc->measure) {
|
||||||
alloc->interface.init_tensor(alloc, tensor);
|
tensor->data = NULL;
|
||||||
|
} else {
|
||||||
|
tensor->data = (char*)context->data + context->offset;
|
||||||
|
if (alloc->interface.init_tensor) {
|
||||||
|
ggml_backend_buffer_init_tensor(alloc, tensor);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
context->offset = aligned_offset(context->data, context->offset + size, context->alignment);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_allocator_simple_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
|
static void ggml_allocator_simple_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
|
||||||
@ -83,7 +98,7 @@ static const struct ggml_backend_buffer_interface ggml_allocator_simple_interfac
|
|||||||
/* .free_data = */ NULL,
|
/* .free_data = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment) {
|
static struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment) {
|
||||||
struct ggml_allocator_simple_context * ctx = malloc(sizeof(struct ggml_allocator_simple_context));
|
struct ggml_allocator_simple_context * ctx = malloc(sizeof(struct ggml_allocator_simple_context));
|
||||||
ctx->data = data;
|
ctx->data = data;
|
||||||
ctx->size = size;
|
ctx->size = size;
|
||||||
@ -94,20 +109,35 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size
|
|||||||
*allocator = (struct ggml_backend_buffer){
|
*allocator = (struct ggml_backend_buffer){
|
||||||
/* .interface = */ ggml_allocator_simple_interface,
|
/* .interface = */ ggml_allocator_simple_interface,
|
||||||
/* .context = */ ctx,
|
/* .context = */ ctx,
|
||||||
|
/* .backend = */ NULL,
|
||||||
/* .backend_data = */ NULL,
|
/* .backend_data = */ NULL,
|
||||||
|
/* .measure = */ false,
|
||||||
|
/* .max_size = */ 0,
|
||||||
};
|
};
|
||||||
return allocator;
|
return allocator;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
|
||||||
|
struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
|
||||||
|
return ggml_allocator_simple_init(data, size, alignment);
|
||||||
|
}
|
||||||
|
|
||||||
// buffer
|
// buffer
|
||||||
|
|
||||||
struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors) {
|
struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors) {
|
||||||
struct ggml_buffer * buffer = malloc(sizeof(struct ggml_buffer));
|
struct ggml_buffer * buffer = malloc(sizeof(struct ggml_buffer));
|
||||||
buffer->mem_size = ggml_tensor_overhead() * max_tensors;
|
buffer->mem_size = ggml_tensor_overhead() * max_tensors;
|
||||||
buffer->mem_buffer = malloc(buffer->mem_size);
|
buffer->mem_buffer = malloc(buffer->mem_size);
|
||||||
buffer->backend = backend;
|
|
||||||
size += 128 * max_tensors; // alignment overhead
|
size += 128 * max_tensors; // alignment overhead
|
||||||
buffer->backend_buffer = backend->interface.alloc_buffer(backend, size);
|
buffer->backend_buffer = backend->interface.alloc_buffer(backend, size);
|
||||||
|
buffer->backend_buffer->backend = backend;
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors) {
|
||||||
|
struct ggml_buffer * buffer = ggml_buffer_alloc(backend, 0, max_tensors);
|
||||||
|
buffer->backend_buffer->measure = true;
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -190,7 +220,7 @@ static void ggml_backend_cpu_free_buffer(struct ggml_backend_buffer * alloc) {
|
|||||||
static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_backend * backend, size_t size) {
|
static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_backend * backend, size_t size) {
|
||||||
void * data = malloc(size);
|
void * data = malloc(size);
|
||||||
|
|
||||||
struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
|
struct ggml_backend_buffer * buffer = ggml_allocator_default_init(data, size, TENSOR_ALIGNMENT);
|
||||||
buffer->interface.free_data = ggml_backend_cpu_free_buffer;
|
buffer->interface.free_data = ggml_backend_cpu_free_buffer;
|
||||||
buffer->backend_data = data;
|
buffer->backend_data = data;
|
||||||
|
|
||||||
@ -674,3 +704,33 @@ void allocate_graph(struct ggml_cgraph * gf, struct ggml_buffer * buffer) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
void ggml_graph_allocate_tensors(struct ggml_cgraph * graph) {
|
||||||
|
ggml_graph_allocate_tensors_n(&graph, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs) {
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) {
|
||||||
|
bool visited[GGML_MAX_SPLITS] = {false};
|
||||||
|
for (int i = 0; i < splits->n_splits; i++) {
|
||||||
|
if (!visited[i]) {
|
||||||
|
struct ggml_graph_split * split = &splits->splits[i];
|
||||||
|
struct ggml_backend * backend = split->dst_inputs[0]->backend; // not great
|
||||||
|
struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS];
|
||||||
|
int num_graphs = 0;
|
||||||
|
for (int j = i; j < splits->n_splits; j++) {
|
||||||
|
if (splits->splits[j].dst_inputs[0]->backend == backend) {
|
||||||
|
backend_graphs[num_graphs++] = splits->splits[j].graph;
|
||||||
|
visited[j] = true;
|
||||||
|
// TODO: need to ensure that the output tensors are never freed
|
||||||
|
// maybe this can be done automatically in ggml_graph_calc_compute_buffer_size by assuming that n_childs == 0 => output tensor
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ggml_graph_allocate_tensors_n(backend_graphs, num_graphs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -7,8 +7,7 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
struct ggml_backend;
|
struct ggml_backend;
|
||||||
|
|
||||||
|
// backend buffer
|
||||||
// backend buffers
|
|
||||||
typedef void * ggml_buffer_context_t;
|
typedef void * ggml_buffer_context_t;
|
||||||
struct ggml_backend_buffer;
|
struct ggml_backend_buffer;
|
||||||
|
|
||||||
@ -27,7 +26,10 @@ extern "C" {
|
|||||||
struct ggml_backend_buffer {
|
struct ggml_backend_buffer {
|
||||||
struct ggml_backend_buffer_interface interface;
|
struct ggml_backend_buffer_interface interface;
|
||||||
ggml_buffer_context_t context;
|
ggml_buffer_context_t context;
|
||||||
|
struct ggml_backend * backend;
|
||||||
void * backend_data;
|
void * backend_data;
|
||||||
|
bool measure;
|
||||||
|
size_t max_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
// backend buffer helper functions
|
// backend buffer helper functions
|
||||||
@ -36,11 +38,8 @@ extern "C" {
|
|||||||
static inline void ggml_backend_buffer_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
|
static inline void ggml_backend_buffer_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
|
||||||
static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
|
static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
|
||||||
|
|
||||||
// default buffer allocators
|
// default buffer allocator
|
||||||
// simple buffer allocator: cannot free tensors, good for weights and small contexts
|
GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment);
|
||||||
// default buffer allocator: can free tensors, good for compute contexts
|
|
||||||
GGML_API struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment);
|
|
||||||
GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment, int max_free_blocks);
|
|
||||||
|
|
||||||
// buffer
|
// buffer
|
||||||
|
|
||||||
@ -51,11 +50,12 @@ extern "C" {
|
|||||||
void * mem_buffer;
|
void * mem_buffer;
|
||||||
|
|
||||||
// tensor data
|
// tensor data
|
||||||
struct ggml_backend * backend;
|
|
||||||
struct ggml_backend_buffer * backend_buffer;
|
struct ggml_backend_buffer * backend_buffer;
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors);
|
GGML_API struct ggml_buffer * ggml_buffer_alloc (struct ggml_backend * backend, size_t size, size_t max_tensors);
|
||||||
|
GGML_API struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors);
|
||||||
|
// measure buffers only calculate the maximum size of the buffer without allocating it - useful for pre-allocation
|
||||||
GGML_API void ggml_buffer_free(struct ggml_buffer * buffer);
|
GGML_API void ggml_buffer_free(struct ggml_buffer * buffer);
|
||||||
|
|
||||||
// backend
|
// backend
|
||||||
@ -152,6 +152,11 @@ extern "C" {
|
|||||||
// compute
|
// compute
|
||||||
GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);
|
GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);
|
||||||
|
|
||||||
|
// graph tensor allocator
|
||||||
|
GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph);
|
||||||
|
GGML_API void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs);
|
||||||
|
GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -1726,7 +1726,7 @@ static ggml_backend_buffer * ggml_backend_cuda_alloc_buffer(ggml_backend * backe
|
|||||||
void * data;
|
void * data;
|
||||||
CUDA_CHECK(cudaMalloc(&data, size));
|
CUDA_CHECK(cudaMalloc(&data, size));
|
||||||
|
|
||||||
ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
|
ggml_backend_buffer * buffer = ggml_allocator_default_init(data, size, TENSOR_ALIGNMENT);
|
||||||
buffer->interface.free_data = ggml_backend_cuda_free_buffer;
|
buffer->interface.free_data = ggml_backend_cuda_free_buffer;
|
||||||
buffer->backend_data = data;
|
buffer->backend_data = data;
|
||||||
|
|
||||||
|
4
ggml.c
4
ggml.c
@ -4468,7 +4468,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend * ggml_get_ctx_backend(struct ggml_context * ctx) {
|
struct ggml_backend * ggml_get_ctx_backend(struct ggml_context * ctx) {
|
||||||
return ctx->buffer->backend;
|
return ctx->buffer->backend_buffer->backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -4520,7 +4520,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|||||||
ggml_assert_aligned(result);
|
ggml_assert_aligned(result);
|
||||||
|
|
||||||
*result = (struct ggml_tensor) {
|
*result = (struct ggml_tensor) {
|
||||||
/*.backend =*/ ctx->buffer->backend,
|
/*.backend =*/ ggml_get_ctx_backend(ctx),
|
||||||
/*.type =*/ type,
|
/*.type =*/ type,
|
||||||
/*.n_dims =*/ n_dims,
|
/*.n_dims =*/ n_dims,
|
||||||
/*.ne =*/ { 1, 1, 1, 1 },
|
/*.ne =*/ { 1, 1, 1, 1 },
|
||||||
|
72
llama.cpp
72
llama.cpp
@ -113,20 +113,6 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF() {
|
|||||||
return k_sizes;
|
return k_sizes;
|
||||||
}
|
}
|
||||||
|
|
||||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
|
||||||
// not actually needed if BLAS is disabled
|
|
||||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL() {
|
|
||||||
static std::map<e_model, size_t> k_sizes = {
|
|
||||||
{ MODEL_3B, 512ull * MB },
|
|
||||||
//{ MODEL_7B, 768ull * MB }, // FIXME: increased until improved memory management
|
|
||||||
{ MODEL_7B, 2048ull * MB },
|
|
||||||
{ MODEL_13B, 1024ull * MB },
|
|
||||||
{ MODEL_30B, 1280ull * MB },
|
|
||||||
{ MODEL_65B, 1536ull * MB },
|
|
||||||
};
|
|
||||||
return k_sizes;
|
|
||||||
}
|
|
||||||
|
|
||||||
// default hparams (LLaMA 7B)
|
// default hparams (LLaMA 7B)
|
||||||
struct llama_hparams {
|
struct llama_hparams {
|
||||||
uint32_t n_vocab = 32000;
|
uint32_t n_vocab = 32000;
|
||||||
@ -1099,8 +1085,7 @@ static void llama_model_load_internal(
|
|||||||
ctx_sum += it.second;
|
ctx_sum += it.second;
|
||||||
}
|
}
|
||||||
|
|
||||||
const size_t mem_required =
|
const size_t mem_required = ctx_sum;
|
||||||
ctx_sum + MEM_REQ_EVAL().at(model.type);
|
|
||||||
|
|
||||||
// this is the memory required by one llama_state
|
// this is the memory required by one llama_state
|
||||||
const size_t mem_required_state =
|
const size_t mem_required_state =
|
||||||
@ -1191,7 +1176,8 @@ static ggml_graph_splits llama_build_graph(
|
|||||||
struct ggml_context * ctx_i = nullptr;
|
struct ggml_context * ctx_i = nullptr;
|
||||||
struct ggml_context * ctx_o = nullptr;
|
struct ggml_context * ctx_o = nullptr;
|
||||||
struct ggml_context * ctx_kv = nullptr;
|
struct ggml_context * ctx_kv = nullptr;
|
||||||
// TODO: reuse vectors to avoid allocations
|
|
||||||
|
// TODO: reuse these vectors to avoid allocations during eval
|
||||||
std::vector<ggml_context *> ctx_ls(n_layer);
|
std::vector<ggml_context *> ctx_ls(n_layer);
|
||||||
std::vector<struct ggml_context *> ctxs;
|
std::vector<struct ggml_context *> ctxs;
|
||||||
|
|
||||||
@ -1212,10 +1198,17 @@ static ggml_graph_splits llama_build_graph(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool measuring = lctx.bufs_compute[0]->backend_buffer->measure;
|
||||||
|
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
// reuse the scale tensor for all layers since it requires a memory transfer
|
// reuse the scale tensor for all layers since it requires a memory transfer
|
||||||
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
|
//struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||||
|
// TODO: this shouldn't be necessary
|
||||||
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx_kv, GGML_TYPE_F32, 1);
|
||||||
|
if (!measuring) {
|
||||||
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||||
|
}
|
||||||
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
|
||||||
|
|
||||||
if (embeddings_input) {
|
if (embeddings_input) {
|
||||||
@ -1459,6 +1452,8 @@ static ggml_graph_splits llama_build_graph(
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_graph_splits_build_forward(&splits, cur);
|
ggml_graph_splits_build_forward(&splits, cur);
|
||||||
|
// TODO: this probably should be automatic on ggml_graph_splits_build_forward (and ggml_build_forward)
|
||||||
|
ggml_graph_splits_allocate_tensors(&splits);
|
||||||
|
|
||||||
// plot the computation graph in dot format (for debugging purposes)
|
// plot the computation graph in dot format (for debugging purposes)
|
||||||
//if (n_past%100 == 0) {
|
//if (n_past%100 == 0) {
|
||||||
@ -2621,17 +2616,6 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
ctx->embedding.resize(hparams.n_embd);
|
ctx->embedding.resize(hparams.n_embd);
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialize compute buffers
|
|
||||||
// TODO: size the buffers more accurately - depends on improved memory management
|
|
||||||
// TODO: skip if no cpu layers
|
|
||||||
for (auto & backend_data : model->backends) {
|
|
||||||
ggml_buffer * buf_compute = ggml_buffer_alloc(backend_data.backend, MEM_REQ_EVAL().at(ctx->model.type), 2048);
|
|
||||||
ctx->bufs_compute.push_back(buf_compute);
|
|
||||||
}
|
|
||||||
// TODO: pinned memory for faster host-device transfers
|
|
||||||
//ggml_cuda_host_register(*(void**)ctx->buf_compute_cpu.backend_buffer, MEM_REQ_EVAL().at(ctx->model.type) + 128*2048);
|
|
||||||
|
|
||||||
|
|
||||||
// initialize the graph input/output buffers
|
// initialize the graph input/output buffers
|
||||||
// input buffer
|
// input buffer
|
||||||
{
|
{
|
||||||
@ -2679,6 +2663,36 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
ggml_free(ctx0);
|
ggml_free(ctx0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// initialize compute buffers
|
||||||
|
// calculate the required memory size
|
||||||
|
|
||||||
|
// create dummy compute buffers - not great, but we need backend-specific buffers to account for their requirements (e.g. alignment)
|
||||||
|
for (auto & backend_data : model->backends) {
|
||||||
|
ggml_buffer * buf_compute = ggml_buffer_measure_alloc(backend_data.backend, 2048);
|
||||||
|
ctx->bufs_compute.push_back(buf_compute);
|
||||||
|
}
|
||||||
|
// build worst-case graph
|
||||||
|
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
|
||||||
|
int n_past = hparams.n_ctx - n_tokens;
|
||||||
|
/*ggml_graph_splits splits =*/ llama_build_graph(*ctx, n_tokens, n_past);
|
||||||
|
|
||||||
|
fprintf(stderr, "%s: compute ctx sizes:\n", __func__);
|
||||||
|
for (size_t i = 0; i < ctx->bufs_compute.size(); ++i) {
|
||||||
|
ggml_buffer * buf = ctx->bufs_compute[i];
|
||||||
|
ggml_backend * backend = buf->backend_buffer->backend;
|
||||||
|
size_t size = buf->backend_buffer->max_size;
|
||||||
|
fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(backend), size / 1024.0 / 1024.0);
|
||||||
|
ggml_buffer_free(buf);
|
||||||
|
|
||||||
|
// reallocate with the correct size
|
||||||
|
buf = ggml_buffer_alloc(buf->backend_buffer->backend, size, 2048);
|
||||||
|
ctx->bufs_compute[i] = buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: use pinned memory for faster host-device transfers
|
||||||
|
//ggml_cuda_host_register(*(void**)ctx->buf_compute_cpu.backend_buffer, MEM_REQ_EVAL().at(ctx->model.type) + 128*2048);
|
||||||
|
|
||||||
|
|
||||||
// resized during inference
|
// resized during inference
|
||||||
if (params.logits_all) {
|
if (params.logits_all) {
|
||||||
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
|
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
|
||||||
|
Loading…
Reference in New Issue
Block a user