ggml : remove ggml_cplan + rework ggml_cgraph

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-09-11 13:05:10 +03:00
parent ee154457dd
commit 119e0bc9ae
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
10 changed files with 248 additions and 175 deletions

View File

@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
constexpr float rms_norm_eps = 5e-6f; constexpr float rms_norm_eps = 5e-6f;
#endif #endif
// Computes `graph` with `n_threads` threads, using `buf` as backing storage for the work buffer.
// The plan is created without a threadpool (nullptr). `buf` is resized to the size reported by
// ggml_graph_plan() whenever that size is non-zero, so one vector can be shared between calls.
// The status returned by ggml_graph_compute() is discarded.
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
// a work buffer is only attached when the plan asks for one
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
static struct ggml_tensor * randomize_tensor( static struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
) { ) {
@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) {
int n_tokens = model.hparams.n_ctx; int n_tokens = model.hparams.n_ctx;
int n_vocab = model.hparams.n_vocab; int n_vocab = model.hparams.n_vocab;
std::vector<uint8_t> work_buffer;
for (int ex=0; ex<n_examples; ++ex) { for (int ex=0; ex<n_examples; ++ex) {
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ compute_size, /*.mem_size =*/ compute_size,
@ -1542,7 +1529,10 @@ int main(int argc, char ** argv) {
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
ggml_build_forward_expand(gf, e); ggml_build_forward_expand(gf, e);
ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); ggml_graph_prepare(gf, 1, nullptr);
ggml_graph_work_init(gf, nullptr);
ggml_graph_compute(gf);
ggml_graph_work_free(gf);
float error_before_opt = ggml_get_f32_1d(e, 0); float error_before_opt = ggml_get_f32_1d(e, 0);
@ -1553,7 +1543,10 @@ int main(int argc, char ** argv) {
ggml_opt(ctx0, opt_params_lbfgs, e); ggml_opt(ctx0, opt_params_lbfgs, e);
// //
ggml_build_forward_expand(gf, e); ggml_build_forward_expand(gf, e);
ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); ggml_graph_prepare(gf, 1, nullptr);
ggml_graph_work_init(gf, nullptr);
ggml_graph_compute(gf);
ggml_graph_work_free(gf);
float error_after_opt = ggml_get_f32_1d(e, 0); float error_after_opt = ggml_get_f32_1d(e, 0);
@ -1607,7 +1600,10 @@ int main(int argc, char ** argv) {
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
ggml_build_forward_expand(gf, logits); ggml_build_forward_expand(gf, logits);
ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); ggml_graph_prepare(gf, 1, nullptr);
ggml_graph_work_init(gf, nullptr);
ggml_graph_compute(gf);
ggml_graph_work_free(gf);
struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

View File

@ -20,17 +20,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
// Computes `graph` with `n_threads` threads, using `buf` as backing storage for the work buffer.
// The plan is created without a threadpool (nullptr). `buf` is resized to the size reported by
// ggml_graph_plan() whenever that size is non-zero, so one vector can be shared between calls.
// The status returned by ggml_graph_compute() is discarded.
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
// a work buffer is only attached when the plan asks for one
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
static float tensor_sum_elements(const ggml_tensor * tensor) { static float tensor_sum_elements(const ggml_tensor * tensor) {
double sum = 0; double sum = 0;
if (tensor->type == GGML_TYPE_F32) { if (tensor->type == GGML_TYPE_F32) {
@ -179,9 +168,8 @@ int main(int argc, char ** argv) {
TENSOR_DUMP(m11); TENSOR_DUMP(m11);
TENSOR_DUMP(m2); TENSOR_DUMP(m2);
std::vector<uint8_t> work_buffer; ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
ggml_graph_work_init(gf, nullptr);
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
TENSOR_DUMP(ggml_graph_node(gf, 0)); TENSOR_DUMP(ggml_graph_node(gf, 0));
@ -234,7 +222,7 @@ int main(int argc, char ** argv) {
long long int start = ggml_time_us(); long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n"); //printf("Running ggml_graph_compute\n");
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); ggml_graph_compute(gf31);
long long int stop = ggml_time_us(); long long int stop = ggml_time_us();
long long int usec = stop-start; long long int usec = stop-start;
@ -267,8 +255,11 @@ int main(int argc, char ** argv) {
} }
// Running a different graph computation to make sure we override the CPU cache lines // Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); ggml_graph_compute(gf32);
} }
ggml_graph_work_free(gf);
printf("\n"); printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
printf("=====================================================================================\n"); printf("=====================================================================================\n");

View File

@ -183,7 +183,9 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten); ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1); ggml_graph_prepare(gf, 1, nullptr);
ggml_graph_work_init(gf, model.ctx);
ggml_graph_compute(gf);
struct ggml_tensor* result = ggml_graph_node(gf, -1); struct ggml_tensor* result = ggml_graph_node(gf, -1);
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context

View File

@ -644,20 +644,6 @@ extern "C" {
typedef struct ggml_threadpool * ggml_threadpool_t; typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads; // number of threads to compute with, filled in by `ggml_graph_plan()`
struct ggml_threadpool * threadpool; // threadpool that was passed to `ggml_graph_plan()`, may be NULL
// abort ggml_graph_compute when the callback returns true
ggml_abort_callback abort_callback;
void * abort_callback_data; // passed as the argument of abort_callback
};
// scratch buffer // scratch buffer
struct ggml_scratch { struct ggml_scratch {
size_t offs; size_t offs;
@ -2047,7 +2033,6 @@ extern "C" {
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
@ -2065,26 +2050,72 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
// TODO: move these declarations above before the ggml_graph API and reorder the implementation order in ggml.c
// (unless the code has been moved to a separate source file)
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); GGML_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute() // =================================================================================================
// when plan.work_size > 0, caller must allocate memory for plan.work_data // CPU-only API for ggml_cgraph
GGML_API struct ggml_cplan ggml_graph_plan( //
const struct ggml_cgraph * cgraph, // TODO: move as a separate backend
// NOTE: avoid using, will be removed
//
// loops through the graph and determines:
//
// - work size needed for CPU computation
// - number of threads to start
//
GGML_API enum ggml_status ggml_graph_prepare(
struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */ int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ ); struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context // get the estimated work size for the graph from ggml_graph_prepare()
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
// if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
// otherwise, the work buffer will be allocated in the context. no need to free it
GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
GGML_API void ggml_graph_work_free(struct ggml_cgraph * cgraph);
// note: call ggml_graph_prepare() and ggml_graph_work_init() first
//
// sample usages:
//
// - no dynamic allocations:
//
// ... prepare ggml_context ctx ...
//
// ggml_graph_prepare (cgraph, n_threads, threadpool);
// ggml_graph_work_init(cgraph, ctx);
//
// ggml_graph_compute (cgraph); // can call many times
//
// // no need to call ggml_graph_work_free() because it is allocated in ctx
//
// - dynamic allocations:
//
// ggml_graph_prepare (cgraph, n_threads, threadpool);
// ggml_graph_work_init(cgraph, NULL); // will allocate memory
//
// ggml_graph_compute (cgraph); // can call many times
//
// ggml_graph_work_free(cgraph);
//
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
// end of CPU-only API
// =================================================================================================
GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
@ -2107,6 +2138,7 @@ extern "C" {
struct ggml_cgraph * gb_tmp, struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints, struct ggml_tensor * * checkpoints,
int n_checkpoints); int n_checkpoints);
// //
// optimization // optimization
// //

View File

@ -751,8 +751,10 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
// TODO: this struct should no longer be needed
// instead, the new ggml_graph_work_init() + ggml_graph_work_free() API should be enough to replace this
// for now, keeping the implementation as it is, to avoid making a mistake
struct ggml_backend_plan_cpu { struct ggml_backend_plan_cpu {
struct ggml_cplan cplan;
struct ggml_cgraph cgraph; struct ggml_cgraph cgraph;
}; };
@ -761,19 +763,19 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy cpu_plan->cgraph = *cgraph; // FIXME: deep copy
ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_plan->cplan.work_size > 0) { if (cpu_plan->cgraph.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
if (cpu_plan->cplan.work_data == NULL) { if (cpu_plan->cgraph.work_data == NULL) {
free(cpu_plan); free(cpu_plan);
return NULL; return NULL;
} }
} }
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; cpu_plan->cgraph.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;
return cpu_plan; return cpu_plan;
} }
@ -781,7 +783,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
free(cpu_plan->cplan.work_data); free(cpu_plan->cgraph.work_data);
free(cpu_plan); free(cpu_plan);
GGML_UNUSED(backend); GGML_UNUSED(backend);
@ -790,7 +792,7 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
// Runs a plan created by ggml_backend_cpu_graph_plan_create(): computes the graph copy stored in the plan
// and returns the status reported by ggml_graph_compute(). `backend` is not used.
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); return ggml_graph_compute(&cpu_plan->cgraph);
GGML_UNUSED(backend); GGML_UNUSED(backend);
}
@ -798,23 +800,24 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
// Computes `cgraph` directly on the CPU backend.
// The work buffer is cached in the backend context (cpu_ctx->work_data / cpu_ctx->work_size) and is only
// reallocated when the graph needs more space than the cached buffer provides.
// Returns GGML_STATUS_ALLOC_FAILED if the buffer cannot be grown, otherwise the status of ggml_graph_compute().
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
// grow the cached buffer when it is too small for this graph
if (cpu_ctx->work_size < cplan.work_size) { if (cpu_ctx->work_size < cgraph->work_size) {
free(cpu_ctx->work_data); free(cpu_ctx->work_data);
cpu_ctx->work_data = malloc(cplan.work_size); cpu_ctx->work_data = malloc(cgraph->work_size);
if (cpu_ctx->work_data == NULL) { if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0; cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED; return GGML_STATUS_ALLOC_FAILED;
} }
cpu_ctx->work_size = cplan.work_size; cpu_ctx->work_size = cgraph->work_size;
} }
cplan.work_data = cpu_ctx->work_data; cgraph->work_data = cpu_ctx->work_data;
cgraph->work_own = false; // the buffer belongs to cpu_ctx and is reused across calls - the graph must not free it
cplan.abort_callback = cpu_ctx->abort_callback; cgraph->abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data; cgraph->abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph);
}
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

View File

@ -773,6 +773,17 @@ struct ggml_cgraph {
struct ggml_hash_set visited_hash_set; struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order; enum ggml_cgraph_eval_order order;
// CPU compute state, filled in by `ggml_graph_prepare()` and `ggml_graph_work_init()`
bool work_own; // true when work_data was allocated by `ggml_graph_work_init(cgraph, NULL)`; such a buffer is released by `ggml_graph_work_free()`
size_t work_size; // size of work buffer, calculated by `ggml_graph_prepare()`
uint8_t * work_data; // work buffer, set by `ggml_graph_work_init()` (or assigned directly by the caller) before calling `ggml_graph_compute()`
int n_threads; // number of threads to compute with, set by `ggml_graph_prepare()`
struct ggml_threadpool * threadpool; // threadpool passed to `ggml_graph_prepare()`, may be NULL
// abort ggml_graph_compute when the callback returns true
ggml_abort_callback abort_callback;
void * abort_callback_data; // passed as the argument of abort_callback
}; };
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

View File

@ -2001,7 +2001,6 @@ struct ggml_threadpool {
ggml_cond_t cond; // cond.var for waiting for new work ggml_cond_t cond; // cond.var for waiting for new work
struct ggml_cgraph * cgraph; struct ggml_cgraph * cgraph;
struct ggml_cplan * cplan;
// synchronization primitives // synchronization primitives
atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
@ -19095,8 +19094,15 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
/*.nodes =*/ nodes_ptr, /*.nodes =*/ nodes_ptr,
/*.grads =*/ grads_ptr, /*.grads =*/ grads_ptr,
/*.leafs =*/ leafs_ptr, /*.leafs =*/ leafs_ptr,
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.visited_hash_set =*/ { hash_size, hash_used, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
/*.work_own =*/ false,
/*.work_size =*/ 0,
/*.work_data =*/ NULL,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
/*.threadpool =*/ NULL,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
}; };
ggml_hash_set_reset(&cgraph->visited_hash_set); ggml_hash_set_reset(&cgraph->visited_hash_set);
@ -19118,6 +19124,13 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
/*.leafs =*/ NULL, /*.leafs =*/ NULL,
/*.hash_table =*/ { 0, NULL, NULL }, /*.hash_table =*/ { 0, NULL, NULL },
/*.order =*/ cgraph0->order, /*.order =*/ cgraph0->order,
/*.work_own =*/ false,
/*.work_size =*/ 0,
/*.work_data =*/ NULL,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
/*.threadpool =*/ NULL,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
}; };
return cgraph; return cgraph;
@ -19753,11 +19766,10 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
#endif #endif
} }
struct ggml_cplan ggml_graph_plan( enum ggml_status ggml_graph_prepare(
const struct ggml_cgraph * cgraph, struct ggml_cgraph * cgraph,
int n_threads, int n_threads,
struct ggml_threadpool * threadpool) { struct ggml_threadpool * threadpool) {
if (threadpool == NULL) { if (threadpool == NULL) {
GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
} }
@ -19767,9 +19779,6 @@ struct ggml_cplan ggml_graph_plan(
size_t work_size = 0; size_t work_size = 0;
struct ggml_cplan cplan;
memset(&cplan, 0, sizeof(struct ggml_cplan));
int max_tasks = 1; int max_tasks = 1;
// thread scheduling for the different operations + work buffer size estimation // thread scheduling for the different operations + work buffer size estimation
@ -19921,28 +19930,63 @@ struct ggml_cplan ggml_graph_plan(
work_size += CACHE_LINE_SIZE*(n_threads); work_size += CACHE_LINE_SIZE*(n_threads);
} }
cplan.threadpool = threadpool; cgraph->threadpool = threadpool;
cplan.n_threads = MIN(max_tasks, n_threads); cgraph->n_threads = MIN(max_tasks, n_threads);
cplan.work_size = work_size; cgraph->work_size = work_size;
cplan.work_data = NULL;
return cplan; ggml_graph_work_free(cgraph);
return GGML_STATUS_SUCCESS;
}
// Reports the work buffer size currently recorded on the graph.
// The value is written by ggml_graph_prepare(); a freshly created graph reports 0.
size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph) {
    const size_t n_bytes = cgraph->work_size;

    return n_bytes;
}
// Attaches a work buffer of cgraph->work_size bytes to the graph.
//
// - ctx == NULL: the buffer is allocated with GGML_ALIGNED_MALLOC and is owned by the graph
//                (work_own = true); release it with ggml_graph_work_free()
// - ctx != NULL: the buffer is created inside ctx as a GGML_OBJECT_TYPE_WORK_BUFFER object and
//                is not owned by the graph (work_own = false)
//
// A buffer that the graph owns from an earlier call is released first.
// When work_size is 0 nothing is allocated and work_data is left as it is.
//
// returns GGML_STATUS_ALLOC_FAILED when the buffer could not be obtained, GGML_STATUS_SUCCESS otherwise
enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx) {
    GGML_ASSERT(cgraph->n_threads > 0 && "call ggml_graph_prepare first");

    // do not leak a buffer that was allocated by a previous init
    ggml_graph_work_free(cgraph);

    if (cgraph->work_size == 0) {
        return GGML_STATUS_SUCCESS;
    }

    if (ctx == NULL) {
        cgraph->work_data = GGML_ALIGNED_MALLOC(cgraph->work_size);
        if (cgraph->work_data == NULL) {
            return GGML_STATUS_ALLOC_FAILED;
        }
        cgraph->work_own = true;

        return GGML_STATUS_SUCCESS;
    }

    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cgraph->work_size);
    if (obj == NULL) {
        // NOTE(review): assumes ggml_new_object() returns NULL when ctx has no room left - confirm.
        // Report the failure like the malloc path does instead of dereferencing a NULL object,
        // and clear work_data so that ggml_graph_compute() cannot run on a stale buffer.
        cgraph->work_data = NULL;
        cgraph->work_own  = false;

        return GGML_STATUS_ALLOC_FAILED;
    }

    cgraph->work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
    cgraph->work_own  = false;

    return GGML_STATUS_SUCCESS;
}
// Releases the work buffer when it is owned by the graph, i.e. when it was allocated by
// ggml_graph_work_init(cgraph, NULL). A buffer that is not owned by the graph is left untouched.
// Safe to call repeatedly and on a graph that has no work buffer.
void ggml_graph_work_free(struct ggml_cgraph * cgraph) {
    if (cgraph->work_data && cgraph->work_own) {
        GGML_ALIGNED_FREE(cgraph->work_data);
        cgraph->work_data = NULL;
        // drop the ownership flag together with the buffer: otherwise a buffer that is later assigned
        // to work_data directly (without touching work_own) would be passed to GGML_ALIGNED_FREE
        // by the next call
        cgraph->work_own = false;
    }
}
static thread_ret_t ggml_graph_compute_thread(void * data) { static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_state * state = (struct ggml_compute_state *) data;
const struct ggml_cgraph * cgraph = state->threadpool->cgraph; const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
const struct ggml_cplan * cplan = state->threadpool->cplan;
set_numa_thread_affinity(state->ith); set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = { struct ggml_compute_params params = {
/*.ith =*/ state->ith, /*.ith =*/ state->ith,
/*.nth =*/ state->threadpool->n_threads_cur, /*.nth =*/ state->threadpool->n_threads_cur,
/*.wsize =*/ cplan->work_size, /*.wsize =*/ cgraph->work_size,
/*.wdata =*/ cplan->work_data, /*.wdata =*/ cgraph->work_data,
/*.threadpool=*/ state->threadpool, /*.threadpool =*/ state->threadpool,
}; };
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
@ -19950,7 +19994,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { if (state->ith == 0 && cgraph->abort_callback && cgraph->abort_callback(cgraph->abort_callback_data)) {
state->threadpool->ec = GGML_STATUS_ABORTED; state->threadpool->ec = GGML_STATUS_ABORTED;
} }
@ -20104,14 +20148,12 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
static struct ggml_threadpool * ggml_threadpool_new_impl( static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_threadpool_params * tpp, struct ggml_threadpool_params * tpp,
struct ggml_cgraph * cgraph, struct ggml_cgraph * cgraph) {
struct ggml_cplan * cplan) {
struct ggml_threadpool * threadpool = struct ggml_threadpool * threadpool =
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
{ {
threadpool->cgraph = cgraph; threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
threadpool->n_graph = 0; threadpool->n_graph = 0;
threadpool->n_barrier = 0; threadpool->n_barrier = 0;
threadpool->n_barrier_passed = 0; threadpool->n_barrier_passed = 0;
@ -20169,16 +20211,15 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
} }
struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) { struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
return ggml_threadpool_new_impl(tpp, NULL, NULL); return ggml_threadpool_new_impl(tpp, NULL);
} }
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph) {
GGML_ASSERT(cplan); GGML_ASSERT((cgraph->n_threads > 0 ) && "call ggml_graph_prepare first");
GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT((cgraph->work_size == 0 || cgraph->work_data != NULL) && "call ggml_graph_work_init first");
GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
int n_threads = cplan->n_threads; int n_threads = cgraph->n_threads;
struct ggml_threadpool * threadpool = cplan->threadpool; struct ggml_threadpool * threadpool = cgraph->threadpool;
bool disposable_threadpool = false; bool disposable_threadpool = false;
@ -20187,19 +20228,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
disposable_threadpool = true; disposable_threadpool = true;
struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan); threadpool = ggml_threadpool_new_impl(&ttp, cgraph);
} else { } else {
// Reset some of the parameters that need resetting // Reset some of the parameters that need resetting
// No worker threads should be accessing the parameters below at this stage // No worker threads should be accessing the parameters below at this stage
threadpool->cgraph = cgraph; threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
threadpool->n_threads_cur = n_threads; threadpool->n_threads_cur = n_threads;
threadpool->current_chunk = 0; threadpool->current_chunk = 0;
threadpool->ec = GGML_STATUS_SUCCESS; threadpool->ec = GGML_STATUS_SUCCESS;
} }
if (n_threads > threadpool->n_threads_max) { if (n_threads > threadpool->n_threads_max) {
GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); GGML_PRINT("WARNING: cgraph is requesting more threads than the threadpool contains. Expect a bad time!\n");
} }
#ifdef GGML_USE_OPENMP #ifdef GGML_USE_OPENMP
@ -20238,14 +20278,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
return ret; return ret;
} }
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); cgraph->abort_callback = abort_callback;
cgraph->abort_callback_data = abort_data;
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
return ggml_graph_compute(cgraph, &cplan);
} }
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
@ -21055,9 +21090,8 @@ static enum ggml_opt_result ggml_opt_adam(
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); ggml_graph_prepare (gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); ggml_graph_work_init(gb, ctx);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
bool cancel = false; bool cancel = false;
@ -21073,7 +21107,7 @@ static enum ggml_opt_result ggml_opt_adam(
} }
// ggml_graph_reset (gf); // ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f); ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan); ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm); ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0); fx += ggml_get_f32_1d(f, 0);
} }
@ -21164,7 +21198,7 @@ static enum ggml_opt_result ggml_opt_adam(
} }
// ggml_graph_reset (gf); // ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f); ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan); ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm); ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0); fx += ggml_get_f32_1d(f, 0);
} }
@ -21249,7 +21283,6 @@ static enum ggml_opt_result linesearch_backtracking(
const float * xp, const float * xp,
struct ggml_tensor * f, struct ggml_tensor * f,
struct ggml_cgraph * gb, struct ggml_cgraph * gb,
struct ggml_cplan * cplan,
const int np, const int np,
struct ggml_tensor * ps[], struct ggml_tensor * ps[],
bool * cancel, bool * cancel,
@ -21306,7 +21339,7 @@ static enum ggml_opt_result linesearch_backtracking(
} }
// ggml_graph_reset (gf); // ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f); ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, cplan); ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm); ggml_opt_acc_grad(np, ps, g, accum_norm);
*fx += ggml_get_f32_1d(f, 0); *fx += ggml_get_f32_1d(f, 0);
} }
@ -21402,9 +21435,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
opt->iter = iter; opt->iter = iter;
} }
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); ggml_graph_prepare (gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); ggml_graph_work_init(gb, ctx);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
float * x = opt->lbfgs.x->data; // current parameters float * x = opt->lbfgs.x->data; // current parameters
float * xp = opt->lbfgs.xp->data; // previous parameters float * xp = opt->lbfgs.xp->data; // previous parameters
@ -21449,7 +21481,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
} }
// ggml_graph_reset (gf); // ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f); ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan); ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm); ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0); fx += ggml_get_f32_1d(f, 0);
} }
@ -21515,7 +21547,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
// to determine if the optimization should be cancelled // to determine if the optimization should be cancelled
// this is a simple change, but not doing this atm, since I don't have a nice // this is a simple change, but not doing this atm, since I don't have a nice
// way to test and don't want to break something with so many changes lined up // way to test and don't want to break something with so many changes lined up
ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, np, ps, &cancel, callback, callback_data);
if (cancel) { if (cancel) {
return GGML_OPT_RESULT_CANCEL; return GGML_OPT_RESULT_CANCEL;
} }

View File

@ -242,12 +242,16 @@ static bool check_gradient(
ggml_graph_cpy(gf, gb); ggml_graph_cpy(gf, gb);
ggml_build_backward_expand(ctx0, gf, gb, false); ggml_build_backward_expand(ctx0, gf, gb, false);
ggml_graph_compute_with_ctx(ctx0, gf, n_threads); ggml_graph_prepare(gf, n_threads, nullptr);
ggml_graph_work_init(gf, ctx0);
ggml_graph_compute(gf);
ggml_graph_reset (gf); ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f); ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute_with_ctx(ctx0, gb, n_threads); ggml_graph_prepare(gb, n_threads, nullptr);
ggml_graph_work_init(gb, ctx0);
ggml_graph_compute(gb);
// ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
// ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot"); // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot");
@ -262,13 +266,17 @@ static bool check_gradient(
const float xp = x0 + eps; const float xp = x0 + eps;
ggml_set_f32_1d(x[i], k, xp); ggml_set_f32_1d(x[i], k, xp);
ggml_graph_compute_with_ctx(ctx0, gf, n_threads); ggml_graph_prepare(gf, n_threads, nullptr);
ggml_graph_work_init(gf, ctx0);
ggml_graph_compute(gf);
const double f0 = ggml_get_f32_1d(f, 0); const double f0 = ggml_get_f32_1d(f, 0);
ggml_set_f32_1d(x[i], k, xm); ggml_set_f32_1d(x[i], k, xm);
ggml_graph_compute_with_ctx(ctx0, gf, n_threads); ggml_graph_prepare(gf, n_threads, nullptr);
ggml_graph_work_init(gf, ctx0);
ggml_graph_compute(gf);
const double f1 = ggml_get_f32_1d(f, 0); const double f1 = ggml_get_f32_1d(f, 0);
const double g0 = (f0 - f1)/(2.0*(double) eps); const double g0 = (f0 - f1)/(2.0*(double) eps);
@ -301,7 +309,9 @@ static bool check_gradient(
ggml_graph_reset (gf); ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f); ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute_with_ctx(ctx0, gb, n_threads); ggml_graph_prepare(gb, n_threads, nullptr);
ggml_graph_work_init(gb, ctx0);
ggml_graph_compute(gb);
const double g1 = ggml_get_f32_1d(x[i]->grad, k); const double g1 = ggml_get_f32_1d(x[i]->grad, k);

View File

@ -113,7 +113,10 @@ int main(void) {
ggml_build_forward_expand(ge, e); ggml_build_forward_expand(ge, e);
ggml_graph_reset(ge); ggml_graph_reset(ge);
ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); ggml_graph_prepare(ge, 1, nullptr);
ggml_graph_work_init(ge, nullptr);
ggml_graph_compute(ge);
ggml_graph_work_free(ge);
const float fe = ggml_get_f32_1d(e, 0); const float fe = ggml_get_f32_1d(e, 0);
printf("%s: e = %.4f\n", __func__, fe); printf("%s: e = %.4f\n", __func__, fe);
@ -124,7 +127,10 @@ int main(void) {
ggml_graph_reset(ge); ggml_graph_reset(ge);
ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); ggml_graph_prepare(ge, 1, nullptr);
ggml_graph_work_init(ge, nullptr);
ggml_graph_compute(ge);
ggml_graph_work_free(ge);
const float fe_opt = ggml_get_f32_1d(e, 0); const float fe_opt = ggml_get_f32_1d(e, 0);
printf("%s: original e = %.4f\n", __func__, fe); printf("%s: original e = %.4f\n", __func__, fe);

View File

@ -112,17 +112,6 @@ static struct ggml_tensor * get_random_tensor_f32(
return result; return result;
} }
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
int main(int /*argc*/, const char ** /*argv*/) { int main(int /*argc*/, const char ** /*argv*/) {
struct ggml_init_params params = { struct ggml_init_params params = {
/* .mem_size = */ 128*1024*1024, /* .mem_size = */ 128*1024*1024,
@ -130,8 +119,6 @@ int main(int /*argc*/, const char ** /*argv*/) {
/* .no_alloc = */ false, /* .no_alloc = */ false,
}; };
std::vector<uint8_t> work_buffer;
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
struct ggml_tensor * x; struct ggml_tensor * x;
@ -175,7 +162,10 @@ int main(int /*argc*/, const char ** /*argv*/) {
ggml_build_forward_expand(gf, r1); ggml_build_forward_expand(gf, r1);
ggml_build_forward_expand(gf, r2); ggml_build_forward_expand(gf, r2);
ggml_graph_compute_helper(work_buffer, gf, 4); ggml_graph_prepare(gf, 4, nullptr);
ggml_graph_work_init(gf, nullptr);
ggml_graph_compute(gf);
ggml_graph_work_free(gf);
// check that r1 and r2 are the same // check that r1 and r2 are the same
{ {