From 4dad9fa50e7d350a92dcc2f02926b62fe47e3843 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 26 Jan 2025 12:33:16 +0200
Subject: [PATCH] metal : use residency sets

---
Review notes (ignored by git-am):
 - add_residency_set: assert on `residency_set` instead of the undeclared `queue`
 - residency set creation: initialise the NSError pointer, test the returned
   object rather than the error pointer, and release the descriptor (MRC file)
 - hunk offsets/counts below were recomputed for the 3 added lines
 - column alignment of context lines could not be recovered from the collapsed
   source; apply with --ignore-whitespace if needed

 ggml/src/ggml-metal/ggml-metal.m | 170 ++++++++++++++++++++++++++++++-
 1 file changed, 165 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index a85502ee0..2eacd55f9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -19,6 +19,8 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
+#define GGML_METAL_MAX_RESIDENCY_SETS 128
+
 #define UNUSED(x) (void)(x)
 
 // globals
@@ -37,6 +39,9 @@ static struct ggml_backend_metal_device_context {
     id<MTLDevice> mtl_device;
     int mtl_device_ref_count;
 
+    id<MTLResidencySet> mtl_residency_set[GGML_METAL_MAX_RESIDENCY_SETS];
+    int mtl_residency_set_n;
+
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
     bool has_bfloat;
@@ -46,6 +51,8 @@ static struct ggml_backend_metal_device_context {
 } g_ggml_ctx_dev_main = {
     /*.mtl_device =*/ nil,
     /*.mtl_device_ref_count =*/ 0,
+    /*.mtl_residency_set =*/ { nil },
+    /*.mtl_residency_set_n =*/ 0,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm =*/ false,
     /*.has_bfloat =*/ false,
@@ -95,6 +102,41 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     }
 }
 
+// register a residency set in the device context; returns false (and logs) when the table is full
+static bool ggml_backend_metal_device_add_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
+    assert(ctx != NULL);
+    assert(residency_set != nil);
+
+    if (ctx->mtl_residency_set_n >= GGML_METAL_MAX_RESIDENCY_SETS) {
+        GGML_LOG_ERROR("%s: warning: maximum number of residency sets reached\n", __func__);
+        return false;
+    }
+
+    ctx->mtl_residency_set[ctx->mtl_residency_set_n++] = residency_set;
+
+    return true;
+}
+
+// unregister a residency set, compacting the table in place; returns false if the set was not registered
+static bool ggml_backend_metal_device_remove_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
+    assert(ctx != NULL);
+    assert(residency_set != nil);
+
+    for (int i = 0; i < ctx->mtl_residency_set_n; ++i) {
+        if (ctx->mtl_residency_set[i] == residency_set) {
+            for (int j = i; j < ctx->mtl_residency_set_n - 1; ++j) {
+                ctx->mtl_residency_set[j] = ctx->mtl_residency_set[j + 1];
+            }
+
+            ctx->mtl_residency_set_n--;
+
+            return true;
+        }
+    }
+
+    return false;
+}
+
 // kernels
 
 struct ggml_metal_kernel {
@@ -483,6 +525,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->queue = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -1035,6 +1082,8 @@ struct ggml_backend_metal_buffer_context {
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
     struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    id<MTLResidencySet> residency_set;
 };
 
 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -4039,6 +4088,23 @@ static enum ggml_status ggml_metal_graph_compute(
     struct ggml_backend_metal_context * ctx = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
+    // attach the residency sets to the queue on the first run
+    // also tested attaching them on each run, but it does not make a difference
+    static bool is_first = true;
+    if (is_first) {
+        is_first = false;
+        GGML_LOG_INFO("%s: adding %d residency sets\n", __func__, ctx_dev->mtl_residency_set_n);
+        [ctx->queue addResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
+    }
+
+    // this does not make a difference
+    //for (int i = 0; i < ctx_dev->mtl_residency_set_n; ++i) {
+    //    GGML_LOG_INFO("%s: residency set %d allocations size = %zu\n", __func__, i, [ctx_dev->mtl_residency_set[i] allocatedSize]);
+    //    [ctx_dev->mtl_residency_set[i] requestResidency];
+    //}
+
+    int64_t t_start_us = ggml_time_us();
+
     // number of nodes encoded by the main thread (empirically determined)
     const int n_main = 128;
 
@@ -4086,9 +4152,12 @@ static enum ggml_status ggml_metal_graph_compute(
         // the main thread commits the first few commands immediately
         // command_buffer[n_cb]
         {
-            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
+            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
             ctx->command_buffers[n_cb] = command_buffer;
 
+            // does not make a difference
+            [command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
+
             [command_buffer enqueue];
             ctx->encode_async(n_cb);
         }
@@ -4096,9 +4165,12 @@ static enum ggml_status ggml_metal_graph_compute(
         // prepare the rest of the command buffers asynchronously
         // command_buffer[0.. n_cb)
         for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
+            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
             ctx->command_buffers[cb_idx] = command_buffer;
 
+            // does not make a difference
+            [command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
+
             // always enqueue the first two command buffers
             // enqueue all of the command buffers if we don't need to abort
             if (cb_idx < 2 || ctx->abort_callback == NULL) {
@@ -4163,6 +4235,10 @@ static enum ggml_status ggml_metal_graph_compute(
         }
     }
 
+    int64_t t_end_us = ggml_time_us();
+
+    GGML_LOG_DEBUG("%s: compute graph took %8.2f ms\n", __func__, (t_end_us - t_start_us) / 1000.0);
+
     return GGML_STATUS_SUCCESS;
 }
 
@@ -4176,6 +4252,13 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     for (int i = 0; i < ctx->n_buffers; i++) {
         [ctx->buffers[i].metal release];
     }
+
+    ggml_backend_metal_device_remove_residency_set(buffer->buft->device->context, ctx->residency_set);
+
+    [ctx->residency_set endResidency];
+    [ctx->residency_set removeAllAllocations];
+    [ctx->residency_set release];
+
     ggml_backend_metal_device_rel(buffer->buft->device->context);
 
     if (ctx->owned) {
@@ -4284,7 +4367,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -4307,10 +4391,35 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(buft->device->context);
+        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
 
+    {
+        MTLResidencySetDescriptor * desc;
+        desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"Primary residency set";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error = nil;
+        ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
+        [desc release]; // alloc/init'ed above and not used after the set has been created
+        if (ctx->residency_set == nil) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            return NULL;
+        }
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->residency_set addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->residency_set commit];
+        [ctx->residency_set requestResidency];
+
+        // track the residency set in the device context
+        ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
+    }
+
     //ggml_backend_metal_log_allocated_size(device, size_aligned);
 
     return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
@@ -4400,7 +4509,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4563,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }
 
+    {
+        MTLResidencySetDescriptor * desc;
+        desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"Primary residency set";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error = nil;
+        ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
+        [desc release]; // alloc/init'ed above and not used after the set has been created
+        if (ctx->residency_set == nil) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            return NULL;
+        }
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->residency_set addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->residency_set commit];
+        [ctx->residency_set requestResidency];
+
+        // track the residency set in the device context
+        ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
 
@@ -4766,6 +4901,31 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
         }
     }
 
+    {
+        MTLResidencySetDescriptor * desc;
+        desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"Primary residency set";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error = nil;
+        ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
+        [desc release]; // alloc/init'ed above and not used after the set has been created
+        if (ctx->residency_set == nil) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            return NULL;
+        }
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->residency_set addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->residency_set commit];
+        [ctx->residency_set requestResidency];
+
+        // track the residency set in the device context
+        ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
 