mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
metal : enable ggml-alloc (#2627)
* metal: enable ggml-alloc Make ggml-alloc work with concurrent dispatch. * style-fix Co-authored-by: slaren <slarengh@gmail.com> --------- Co-authored-by: slaren <slarengh@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
bf83bff674
commit
fc8ef549e5
25
ggml-alloc.c
25
ggml-alloc.c
@ -67,6 +67,8 @@ struct ggml_allocr {
|
|||||||
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||||
size_t max_size;
|
size_t max_size;
|
||||||
bool measure;
|
bool measure;
|
||||||
|
int parse_seq[GGML_MAX_NODES];
|
||||||
|
bool has_parse_seq;
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
struct ggml_tensor * allocated_tensors[1024];
|
struct ggml_tensor * allocated_tensors[1024];
|
||||||
@ -229,6 +231,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
|
|||||||
alloc->n_free_blocks++;
|
alloc->n_free_blocks++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Records the order in which the allocator should visit the nodes of a graph.
// Every entry of list[0..n) that is not -1 is copied, in order and packed from
// index 0, into alloc->parse_seq; alloc->has_parse_seq is then set to true.
//   alloc - allocator whose parse sequence is being set
//   list  - node indices in the desired visiting order; entries equal to -1 are skipped
//   n     - number of entries in list (including the -1 entries)
// NOTE(review): the -1 entries are presumably separators in the Metal concur_list - confirm against the producer.
// NOTE(review): pos is never checked against the capacity of parse_seq
// (declared as int parse_seq[GGML_MAX_NODES]), so the caller must not pass
// more than GGML_MAX_NODES entries that differ from -1.
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
|
||||||
|
// next free slot in alloc->parse_seq
int pos = 0;
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
// -1 entries are dropped so that parse_seq holds node indices only
if (list[i] != -1) {
|
||||||
|
alloc->parse_seq[pos] = list[i];
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// slots at index pos and above are left unchanged; the flag is set even when nothing was copied
alloc->has_parse_seq = true;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
||||||
alloc->n_free_blocks = 1;
|
alloc->n_free_blocks = 1;
|
||||||
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
||||||
@ -248,6 +261,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
|||||||
/*.hash_table = */ {{0}},
|
/*.hash_table = */ {{0}},
|
||||||
/*.max_size = */ 0,
|
/*.max_size = */ 0,
|
||||||
/*.measure = */ false,
|
/*.measure = */ false,
|
||||||
|
/*.parse_seq = */ {0},
|
||||||
|
/*.has_parse_seq = */ false,
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
/*.allocated_tensors = */ = {0},
|
/*.allocated_tensors = */ = {0},
|
||||||
#endif
|
#endif
|
||||||
@ -275,6 +290,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
|||||||
/*.hash_table = */ {{0}},
|
/*.hash_table = */ {{0}},
|
||||||
/*.max_size = */ 0,
|
/*.max_size = */ 0,
|
||||||
/*.measure = */ true,
|
/*.measure = */ true,
|
||||||
|
/*.parse_seq = */ {0},
|
||||||
|
/*.has_parse_seq = */ false,
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
/*.allocated_tensors = */ = {0},
|
/*.allocated_tensors = */ = {0},
|
||||||
#endif
|
#endif
|
||||||
@ -473,7 +490,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|||||||
allocate_node(alloc, input);
|
allocate_node(alloc, input);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int i = 0; i < gf->n_nodes; i++) {
|
for (int ind = 0; ind < gf->n_nodes; ind++) {
|
||||||
|
int i;
|
||||||
|
if (alloc->has_parse_seq) {
|
||||||
|
i = alloc->parse_seq[ind];
|
||||||
|
} else {
|
||||||
|
i = ind;
|
||||||
|
}
|
||||||
struct ggml_tensor * node = gf->nodes[i];
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
|
||||||
// allocate parents (leafs)
|
// allocate parents (leafs)
|
||||||
|
@ -10,6 +10,10 @@ extern "C" {
|
|||||||
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
||||||
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
||||||
|
|
||||||
|
// tell the allocator to parse nodes following the order described in the list
|
||||||
|
// you should call this if your graph is optimized to execute out-of-order
|
||||||
|
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
|
||||||
|
|
||||||
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
||||||
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
||||||
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
||||||
|
@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
|
|||||||
|
|
||||||
// try to find operations that can be run concurrently in the graph
|
// try to find operations that can be run concurrently in the graph
|
||||||
// you should run it again if the topology of your graph changes
|
// you should run it again if the topology of your graph changes
|
||||||
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
|
||||||
|
|
||||||
// if the graph has been optimized for concurrently dispatch
|
// check if the graph has been optimized for concurrent dispatch; returns the length of the concur_list if optimized, 0 otherwise
|
||||||
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
|
int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
|
||||||
|
|
||||||
|
// output the concur_list for ggml_alloc
|
||||||
|
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
||||||
|
|
||||||
// same as ggml_graph_compute but uses Metal
|
// same as ggml_graph_compute but uses Metal
|
||||||
// creates gf->n_threads command buffers in parallel
|
// creates gf->n_threads command buffers in parallel
|
||||||
|
15
ggml-metal.m
15
ggml-metal.m
@ -236,11 +236,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
|||||||
ctx->n_cb = n_cb;
|
ctx->n_cb = n_cb;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
|
int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
|
||||||
if (ctx->concur_list_len) {
|
return ctx->concur_list_len;
|
||||||
return true;
|
}
|
||||||
}
|
|
||||||
return false;
|
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
||||||
|
return ctx->concur_list;
|
||||||
}
|
}
|
||||||
|
|
||||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||||
@ -383,7 +384,7 @@ void ggml_metal_get_tensor(
|
|||||||
|
|
||||||
void ggml_metal_graph_find_concurrency(
|
void ggml_metal_graph_find_concurrency(
|
||||||
struct ggml_metal_context * ctx,
|
struct ggml_metal_context * ctx,
|
||||||
struct ggml_cgraph * gf) {
|
struct ggml_cgraph * gf, bool check_mem) {
|
||||||
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
|
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
|
||||||
int nodes_unused[GGML_MAX_CONCUR];
|
int nodes_unused[GGML_MAX_CONCUR];
|
||||||
|
|
||||||
@ -430,7 +431,7 @@ void ggml_metal_graph_find_concurrency(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (exe_flag) {
|
if (exe_flag && check_mem) {
|
||||||
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
|
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
|
||||||
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
|
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
|
||||||
int64_t data_start = (int64_t) gf->nodes[i]->data;
|
int64_t data_start = (int64_t) gf->nodes[i]->data;
|
||||||
|
34
llama.cpp
34
llama.cpp
@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
|
|||||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
|
||||||
|
|
||||||
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
|
#if !defined(GGML_USE_CUBLAS)
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#define LLAMA_USE_ALLOCATOR
|
#define LLAMA_USE_ALLOCATOR
|
||||||
#else
|
#else
|
||||||
@ -1846,10 +1846,6 @@ static bool llama_eval_internal(
|
|||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (lctx.ctx_metal) {
|
if (lctx.ctx_metal) {
|
||||||
// TODO: disabled until #2413 is resolved
|
|
||||||
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
|
|
||||||
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
|
|
||||||
//}
|
|
||||||
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
||||||
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
||||||
ggml_metal_get_tensor (lctx.ctx_metal, res);
|
ggml_metal_get_tensor (lctx.ctx_metal, res);
|
||||||
@ -3287,7 +3283,18 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
int n_past = hparams.n_ctx - n_tokens;
|
int n_past = hparams.n_ctx - n_tokens;
|
||||||
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||||
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
|
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
if (params.n_gpu_layers > 0) {
|
||||||
|
ctx->ctx_metal = ggml_metal_init(1);
|
||||||
|
if (!ctx->ctx_metal) {
|
||||||
|
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
||||||
|
llama_free(ctx);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
||||||
|
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// measure memory requirements for the graph
|
// measure memory requirements for the graph
|
||||||
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
||||||
|
|
||||||
@ -3305,6 +3312,11 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
|
|
||||||
ctx->buf_alloc.resize(alloc_size);
|
ctx->buf_alloc.resize(alloc_size);
|
||||||
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
|
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
if (ctx->ctx_metal) {
|
||||||
|
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
||||||
@ -3319,13 +3331,6 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (params.n_gpu_layers > 0) {
|
if (params.n_gpu_layers > 0) {
|
||||||
// this allocates all Metal resources and memory buffers
|
// this allocates all Metal resources and memory buffers
|
||||||
ctx->ctx_metal = ggml_metal_init(1);
|
|
||||||
|
|
||||||
if (!ctx->ctx_metal) {
|
|
||||||
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
|
||||||
llama_free(ctx);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void * data_ptr = NULL;
|
void * data_ptr = NULL;
|
||||||
size_t data_size = 0;
|
size_t data_size = 0;
|
||||||
@ -3354,8 +3359,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
||||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
|
||||||
|
|
||||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
|
||||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
|
|
||||||
#undef LLAMA_METAL_CHECK_BUF
|
#undef LLAMA_METAL_CHECK_BUF
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user