mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 11:23:56 +01:00
allocator: automatic inplace operations
This commit is contained in:
parent
3d679827e7
commit
e87840f9fd
@ -10,6 +10,8 @@
|
|||||||
//#define AT_PRINTF printf
|
//#define AT_PRINTF printf
|
||||||
#define AT_PRINTF(...) ((void)0)
|
#define AT_PRINTF(...) ((void)0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// allocator
|
// allocator
|
||||||
|
|
||||||
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
||||||
@ -135,7 +137,33 @@ struct ggml_allocator_default_context {
|
|||||||
size_t alignment;
|
size_t alignment;
|
||||||
int n_free_blocks;
|
int n_free_blocks;
|
||||||
struct free_block free_blocks[1024];
|
struct free_block free_blocks[1024];
|
||||||
|
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
struct ggml_tensor * allocated_tensors[1024];
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
void add_allocated_tensor(struct ggml_allocator_default_context * ctx, struct ggml_tensor * tensor) {
|
||||||
|
for (int i = 0; i < 1024; i++) {
|
||||||
|
if (ctx->allocated_tensors[i] == NULL) {
|
||||||
|
ctx->allocated_tensors[i] = tensor;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
GGML_ASSERT(!"out of allocated_tensors");
|
||||||
|
}
|
||||||
|
void remove_allocated_tensor(struct ggml_allocator_default_context * ctx, struct ggml_tensor * tensor) {
|
||||||
|
for (int i = 0; i < 1024; i++) {
|
||||||
|
if (ctx->allocated_tensors[i] == tensor ||
|
||||||
|
(ctx->allocated_tensors[i] != NULL && ctx->allocated_tensors[i]->data == tensor->data)) {
|
||||||
|
ctx->allocated_tensors[i] = NULL;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("tried to free tensor %s not found\n", tensor->name);
|
||||||
|
GGML_ASSERT(!"tensor not found");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void ggml_allocator_default_free_buffer(struct ggml_backend_buffer * alloc) {
|
void ggml_allocator_default_free_buffer(struct ggml_backend_buffer * alloc) {
|
||||||
struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
|
struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
|
||||||
@ -149,9 +177,9 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
|
|||||||
/////
|
/////
|
||||||
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
|
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
|
||||||
allocator_ctx->size = MAX_SIZE_INIT;
|
allocator_ctx->size = MAX_SIZE_INIT;
|
||||||
allocator_ctx->data = 0x1000;
|
allocator_ctx->data = (void*) 0x1000;
|
||||||
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
|
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
|
||||||
allocator_ctx->free_blocks[0].addr = 0x1000;
|
allocator_ctx->free_blocks[0].addr = (void*) 0x1000;
|
||||||
}
|
}
|
||||||
/////
|
/////
|
||||||
|
|
||||||
@ -196,9 +224,25 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
|
|
||||||
tensor->data = addr;
|
tensor->data = addr;
|
||||||
|
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
add_allocated_tensor(allocator_ctx, tensor);
|
||||||
|
size_t cur_max = (char*)addr - (char*)allocator_ctx->data + size;
|
||||||
|
if (cur_max > alloc->max_size) {
|
||||||
|
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||||
|
for (int i = 0; i < 1024; i++) {
|
||||||
|
if (allocator_ctx->allocated_tensors[i]) {
|
||||||
|
fprintf(stderr, "%s (%.2f MB) ", allocator_ctx->allocated_tensors[i]->name, ggml_nbytes(allocator_ctx->allocated_tensors[i]) / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
|
||||||
|
|
||||||
|
|
||||||
if (!alloc->measure) {
|
if (!alloc->measure) {
|
||||||
if (alloc->interface.init_tensor) {
|
if (alloc->interface.init_tensor) {
|
||||||
ggml_backend_buffer_init_tensor(alloc, tensor);
|
ggml_backend_buffer_init_tensor(alloc, tensor);
|
||||||
@ -223,6 +267,10 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
|
|||||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
|
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
|
||||||
tensor->freed = true;
|
tensor->freed = true;
|
||||||
|
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
remove_allocated_tensor(allocator_ctx, tensor);
|
||||||
|
#endif
|
||||||
|
|
||||||
// see if we can merge with an existing block
|
// see if we can merge with an existing block
|
||||||
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
|
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
|
||||||
struct free_block * block = &allocator_ctx->free_blocks[i];
|
struct free_block * block = &allocator_ctx->free_blocks[i];
|
||||||
@ -295,6 +343,9 @@ static const struct ggml_backend_buffer_interface ggml_allocator_default_interfa
|
|||||||
|
|
||||||
struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
|
struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
|
||||||
struct ggml_allocator_default_context * ctx = malloc(sizeof(struct ggml_allocator_default_context) /* + n_free_blocks * sizeof(struct free_block) */);
|
struct ggml_allocator_default_context * ctx = malloc(sizeof(struct ggml_allocator_default_context) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||||
|
// debug
|
||||||
|
memset(ctx, 0, sizeof(struct ggml_allocator_default_context));
|
||||||
|
|
||||||
ctx->data = data;
|
ctx->data = data;
|
||||||
ctx->size = size;
|
ctx->size = size;
|
||||||
ctx->alignment = alignment;
|
ctx->alignment = alignment;
|
||||||
@ -815,6 +866,32 @@ void allocate_node(struct ggml_buffer * buffer, struct ggml_tensor * node) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//printf("allocating tensor %s\n", node->name);
|
//printf("allocating tensor %s\n", node->name);
|
||||||
|
// see if we can reuse a parent's buffer (inplace)
|
||||||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
|
struct ggml_tensor * parent = node->src[i];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// TODO: make a list of operations that can be safely made inplace
|
||||||
|
if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && node->op != GGML_OP_MUL_MAT) {
|
||||||
|
if (ggml_is_view(parent)) {
|
||||||
|
struct ggml_tensor * ancestor = parent;
|
||||||
|
do {
|
||||||
|
ancestor = view_parent(ancestor);
|
||||||
|
} while (ggml_is_view(ancestor));
|
||||||
|
if (ancestor->n_views == 1 && ancestor->n_children == 0) {
|
||||||
|
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, ancestor->name, node->name);
|
||||||
|
node->data = ancestor->data;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
node->data = parent->data;
|
||||||
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node);
|
ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -928,15 +1005,17 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
|
|||||||
ancestor = view_parent(ancestor);
|
ancestor = view_parent(ancestor);
|
||||||
} while (ggml_is_view(ancestor));
|
} while (ggml_is_view(ancestor));
|
||||||
ancestor->n_views -= 1;
|
ancestor->n_views -= 1;
|
||||||
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
|
if (ancestor->n_views == 0 && ancestor->n_children == 0 && ancestor->data != node->data) {
|
||||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
if (parent->data != node->data) {
|
||||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
|
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (is_view) {
|
if (is_view) {
|
||||||
struct ggml_tensor * ancestor = node;
|
struct ggml_tensor * ancestor = node;
|
||||||
|
Loading…
Reference in New Issue
Block a user