From e343b8b4d81339d76c156006cec575e0de30bb8c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Sep 2023 18:00:03 +0300 Subject: [PATCH] metal : revert the concurrency change because it was wrong --- examples/metal/metal.cpp | 4 ++-- ggml-metal.h | 2 +- ggml-metal.m | 5 ++--- llama.cpp | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index 50a651fa1..c05a4fa93 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -52,7 +52,7 @@ int main(int argc, char ** argv) { ggml_metal_set_tensor(ctx_metal, input); // warmup - ggml_metal_graph_compute(ctx_metal, &gf, false); + ggml_metal_graph_compute(ctx_metal, &gf); const int n_iter = 16; @@ -60,7 +60,7 @@ int main(int argc, char ** argv) { // the actual inference happens here for (int i = 0; i < n_iter; ++i) { - ggml_metal_graph_compute(ctx_metal, &gf, false); + ggml_metal_graph_compute(ctx_metal, &gf); } const int64_t t1 = ggml_time_us(); diff --git a/ggml-metal.h b/ggml-metal.h index 4e36cc129..fca28d37e 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -77,7 +77,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // same as ggml_graph_compute but uses Metal // creates gf->n_threads command buffers in parallel -void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool concurrent); +void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); #ifdef __cplusplus } diff --git a/ggml-metal.m b/ggml-metal.m index 7ec31c21b..b438b83f9 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -610,15 +610,14 @@ void ggml_metal_graph_find_concurrency( void ggml_metal_graph_compute( struct ggml_metal_context * ctx, - struct ggml_cgraph * gf, - bool concurrent) { + struct ggml_cgraph * gf) { @autoreleasepool { // if there is ctx->concur_list, dispatch concurrently // else fallback to serial dispatch MTLComputePassDescriptor * edesc = 
MTLComputePassDescriptor.computePassDescriptor; - const bool has_concur = concurrent && ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; + const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; diff --git a/llama.cpp b/llama.cpp index 80b9993f4..907d130f9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3012,7 +3012,7 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal) { ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, gf, n_tokens > 1); + ggml_metal_graph_compute(lctx.ctx_metal, gf); } else { ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); }