allocator: automatic inplace operations

This commit is contained in:
slaren 2023-07-21 16:51:50 +02:00
parent 3d679827e7
commit e87840f9fd

View File

@ -10,6 +10,8 @@
//#define AT_PRINTF printf //#define AT_PRINTF printf
#define AT_PRINTF(...) ((void)0) #define AT_PRINTF(...) ((void)0)
// allocator // allocator
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@ -135,7 +137,33 @@ struct ggml_allocator_default_context {
size_t alignment; size_t alignment;
int n_free_blocks; int n_free_blocks;
struct free_block free_blocks[1024]; struct free_block free_blocks[1024];
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
}; };
#ifdef GGML_ALLOCATOR_DEBUG
// Debug-only bookkeeping: store `tensor` in the first empty (NULL) slot of
// ctx->allocated_tensors. If every one of the 1024 slots is already taken,
// the assertion below fires.
void add_allocated_tensor(struct ggml_allocator_default_context * ctx, struct ggml_tensor * tensor) {
    // scan forward until an unused slot is found or the table is exhausted
    int slot = 0;
    while (slot < 1024 && ctx->allocated_tensors[slot] != NULL) {
        slot++;
    }

    if (slot < 1024) {
        ctx->allocated_tensors[slot] = tensor;
        return;
    }

    GGML_ASSERT(!"out of allocated_tensors");
}
// Debug-only bookkeeping: clear the entry for `tensor` in ctx->allocated_tensors.
//
// A slot matches when it holds `tensor` itself, or when it holds another
// tensor whose data pointer equals tensor->data (presumably to cover tensors
// that share one allocation, e.g. in-place reuse -- confirm against callers).
// Only the first matching slot is cleared.
//
// If no slot matches, a diagnostic naming the tensor is written to stderr and
// the assertion below fires.
void remove_allocated_tensor(struct ggml_allocator_default_context * ctx, struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (ctx->allocated_tensors[i] == tensor ||
            (ctx->allocated_tensors[i] != NULL && ctx->allocated_tensors[i]->data == tensor->data)) {
            ctx->allocated_tensors[i] = NULL;
            return;
        }
    }
    // Use stderr rather than stdout: stdout may be buffered, so the message
    // could be lost if the assertion terminates the process. This also matches
    // the other allocator debug output, which goes to stderr.
    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
    GGML_ASSERT(!"tensor not found");
}
#endif
void ggml_allocator_default_free_buffer(struct ggml_backend_buffer * alloc) { void ggml_allocator_default_free_buffer(struct ggml_backend_buffer * alloc) {
struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context; struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
@ -149,9 +177,9 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
///// /////
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) { if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
allocator_ctx->size = MAX_SIZE_INIT; allocator_ctx->size = MAX_SIZE_INIT;
allocator_ctx->data = 0x1000; allocator_ctx->data = (void*) 0x1000;
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT; allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
allocator_ctx->free_blocks[0].addr = 0x1000; allocator_ctx->free_blocks[0].addr = (void*) 0x1000;
} }
///// /////
@ -196,9 +224,25 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
} }
} }
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
tensor->data = addr; tensor->data = addr;
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(allocator_ctx, tensor);
size_t cur_max = (char*)addr - (char*)allocator_ctx->data + size;
if (cur_max > alloc->max_size) {
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (allocator_ctx->allocated_tensors[i]) {
fprintf(stderr, "%s (%.2f MB) ", allocator_ctx->allocated_tensors[i]->name, ggml_nbytes(allocator_ctx->allocated_tensors[i]) / 1024.0 / 1024.0);
}
}
fprintf(stderr, "\n");
}
#endif
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
if (!alloc->measure) { if (!alloc->measure) {
if (alloc->interface.init_tensor) { if (alloc->interface.init_tensor) {
ggml_backend_buffer_init_tensor(alloc, tensor); ggml_backend_buffer_init_tensor(alloc, tensor);
@ -223,6 +267,10 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks); AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
tensor->freed = true; tensor->freed = true;
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(allocator_ctx, tensor);
#endif
// see if we can merge with an existing block // see if we can merge with an existing block
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) { for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
struct free_block * block = &allocator_ctx->free_blocks[i]; struct free_block * block = &allocator_ctx->free_blocks[i];
@ -295,6 +343,9 @@ static const struct ggml_backend_buffer_interface ggml_allocator_default_interfa
struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) { struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
struct ggml_allocator_default_context * ctx = malloc(sizeof(struct ggml_allocator_default_context) /* + n_free_blocks * sizeof(struct free_block) */); struct ggml_allocator_default_context * ctx = malloc(sizeof(struct ggml_allocator_default_context) /* + n_free_blocks * sizeof(struct free_block) */);
// debug
memset(ctx, 0, sizeof(struct ggml_allocator_default_context));
ctx->data = data; ctx->data = data;
ctx->size = size; ctx->size = size;
ctx->alignment = alignment; ctx->alignment = alignment;
@ -815,6 +866,32 @@ void allocate_node(struct ggml_buffer * buffer, struct ggml_tensor * node) {
} }
} else { } else {
//printf("allocating tensor %s\n", node->name); //printf("allocating tensor %s\n", node->name);
// see if we can reuse a parent's buffer (inplace)
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * parent = node->src[i];
if (parent == NULL) {
break;
}
// TODO: make a list of operations that can be safely made inplace
if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && node->op != GGML_OP_MUL_MAT) {
if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
if (ancestor->n_views == 1 && ancestor->n_children == 0) {
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, ancestor->name, node->name);
node->data = ancestor->data;
return;
}
}
else {
node->data = parent->data;
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
}
return;
}
}
ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node); ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node);
} }
} }
@ -928,15 +1005,17 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
ancestor = view_parent(ancestor); ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor)); } while (ggml_is_view(ancestor));
ancestor->n_views -= 1; ancestor->n_views -= 1;
if (ancestor->n_views == 0 && ancestor->n_children == 0) { if (ancestor->n_views == 0 && ancestor->n_children == 0 && ancestor->data != node->data) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor); ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
} }
} }
else { else {
if (parent->data != node->data) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent); ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
} }
} }
} }
}
if (is_view) { if (is_view) {
struct ggml_tensor * ancestor = node; struct ggml_tensor * ancestor = node;