From e87840f9fdf3f2a958030c64a270321489162fa8 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 21 Jul 2023 16:51:50 +0200
Subject: [PATCH] allocator: automatic inplace operations

---
 ggml-backend.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 84 insertions(+), 5 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index a8fc3632b..79c16d4be 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -10,6 +10,8 @@
 //#define AT_PRINTF printf
 #define AT_PRINTF(...) ((void)0)
 
+
+
 // allocator
 
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@@ -135,7 +137,33 @@ struct ggml_allocator_default_context {
     size_t alignment;
     int n_free_blocks;
     struct free_block free_blocks[1024];
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    struct ggml_tensor * allocated_tensors[1024];
+#endif
 };
 
+#ifdef GGML_ALLOCATOR_DEBUG
+void add_allocated_tensor(struct ggml_allocator_default_context * ctx, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (ctx->allocated_tensors[i] == NULL) {
+            ctx->allocated_tensors[i] = tensor;
+            return;
+        }
+    }
+    GGML_ASSERT(!"out of allocated_tensors");
+}
+void remove_allocated_tensor(struct ggml_allocator_default_context * ctx, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (ctx->allocated_tensors[i] == tensor ||
+            (ctx->allocated_tensors[i] != NULL && ctx->allocated_tensors[i]->data == tensor->data)) {
+            ctx->allocated_tensors[i] = NULL;
+            return;
+        }
+    }
+    printf("tried to free tensor %s not found\n", tensor->name);
+    GGML_ASSERT(!"tensor not found");
+}
+#endif
 void ggml_allocator_default_free_buffer(struct ggml_backend_buffer * alloc) {
     struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
@@ -149,9 +177,9 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
     /////
     if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
         allocator_ctx->size = MAX_SIZE_INIT;
-        allocator_ctx->data = 0x1000;
+        allocator_ctx->data = (void*) 0x1000;
         allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
-        allocator_ctx->free_blocks[0].addr = 0x1000;
+        allocator_ctx->free_blocks[0].addr = (void*) 0x1000;
     }
     /////
 
@@ -196,9 +224,25 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
         }
     }
 
-    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
     tensor->data = addr;
 
+#ifdef GGML_ALLOCATOR_DEBUG
+    add_allocated_tensor(allocator_ctx, tensor);
+    size_t cur_max = (char*)addr - (char*)allocator_ctx->data + size;
+    if (cur_max > alloc->max_size) {
+        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        for (int i = 0; i < 1024; i++) {
+            if (allocator_ctx->allocated_tensors[i]) {
+                fprintf(stderr, "%s (%.2f MB) ", allocator_ctx->allocated_tensors[i]->name, ggml_nbytes(allocator_ctx->allocated_tensors[i]) / 1024.0 / 1024.0);
+            }
+        }
+        fprintf(stderr, "\n");
+    }
+#endif
+
+    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
+
+
     if (!alloc->measure) {
         if (alloc->interface.init_tensor) {
             ggml_backend_buffer_init_tensor(alloc, tensor);
@@ -223,6 +267,10 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
     tensor->freed = true;
 
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(allocator_ctx, tensor);
+#endif
+
     // see if we can merge with an existing block
     for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
         struct free_block * block = &allocator_ctx->free_blocks[i];
@@ -295,6 +343,9 @@ static const struct ggml_backend_buffer_interface ggml_allocator_default_interfa
 
 struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
     struct ggml_allocator_default_context * ctx = malloc(sizeof(struct ggml_allocator_default_context) /* + n_free_blocks * sizeof(struct free_block) */);
+    // debug
+    memset(ctx, 0, sizeof(struct ggml_allocator_default_context));
+
     ctx->data = data;
     ctx->size = size;
     ctx->alignment = alignment;
@@ -815,6 +866,32 @@ void allocate_node(struct ggml_buffer * buffer, struct ggml_tensor * node) {
             }
         } else {
             //printf("allocating tensor %s\n", node->name);
+            // see if we can reuse a parent's buffer (inplace)
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                struct ggml_tensor * parent = node->src[i];
+                if (parent == NULL) {
+                    break;
+                }
+                // TODO: make a list of operations that can be safely made inplace
+                if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && node->op != GGML_OP_MUL_MAT) {
+                    if (ggml_is_view(parent)) {
+                        struct ggml_tensor * ancestor = parent;
+                        do {
+                            ancestor = view_parent(ancestor);
+                        } while (ggml_is_view(ancestor));
+                        if (ancestor->n_views == 1 && ancestor->n_children == 0) {
+                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, ancestor->name, node->name);
+                            node->data = ancestor->data;
+                            return;
+                        }
+                    }
+                    else {
+                        node->data = parent->data;
+                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                    }
+                    return;
+                }
+            }
             ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node);
         }
     }
@@ -928,12 +1005,14 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
                         ancestor = view_parent(ancestor);
                     } while (ggml_is_view(ancestor));
                     ancestor->n_views -= 1;
-                    if (ancestor->n_views == 0 && ancestor->n_children == 0) {
+                    if (ancestor->n_views == 0 && ancestor->n_children == 0 && ancestor->data != node->data) {
                         ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
                     }
                 }
                 else {
-                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
+                    if (parent->data != node->data) {
+                        ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
+                    }
                 }
             }
         }