improved memory management fixes

This commit is contained in:
slaren 2023-07-21 12:41:46 +02:00
parent 56e9ae062c
commit 3d679827e7
4 changed files with 72 additions and 28 deletions

View File

@ -7,6 +7,9 @@
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
//#define AT_PRINTF printf
#define AT_PRINTF(...) ((void)0)
// allocator // allocator
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
///// /////
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) { if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
allocator_ctx->size = MAX_SIZE_INIT; allocator_ctx->size = MAX_SIZE_INIT;
//allocator_ctx->data = 0; allocator_ctx->data = 0x1000;
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT; allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
//allocator_ctx->free_blocks[0].addr = 0; allocator_ctx->free_blocks[0].addr = 0x1000;
} }
///// /////
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor); size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, allocator_ctx->alignment); size = aligned_offset(NULL, size, allocator_ctx->alignment);
// printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
size_t max_avail = 0; size_t max_avail = 0;
@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
} }
} }
// printf("block %d\n", best_fit_block); AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) { if (best_fit_block == -1) {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor); size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, allocator_ctx->alignment); size = aligned_offset(NULL, size, allocator_ctx->alignment);
//printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks); AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
tensor->freed = true;
// see if we can merge with an existing block // see if we can merge with an existing block
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) { for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
node->n_children = 0; node->n_children = 0;
node->n_views = 0; node->n_views = 0;
//node->freed = false;
} }
for (int i = 0; i < gf->n_leafs; i++) { for (int i = 0; i < gf->n_leafs; i++) {
struct ggml_tensor * leaf = gf->leafs[i]; struct ggml_tensor * leaf = gf->leafs[i];
leaf->n_children = 0; leaf->n_children = 0;
leaf->n_views = 0; leaf->n_views = 0;
//leaf->freed = false;
} }
} }
@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
struct ggml_cgraph * gf = graphs[g]; struct ggml_cgraph * gf = graphs[g];
for (int i = 0; i < gf->n_nodes; i++) { for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
if (ggml_is_view(node)) {
struct ggml_tensor * ancestor = node;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views += 1;
}
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j]; struct ggml_tensor * parent = node->src[j];
if (parent == NULL) { if (parent == NULL) {
@ -869,47 +882,74 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
if (parent == NULL) { if (parent == NULL) {
break; break;
} }
if (parent->freed) {
printf("!!!!!! tensor %s used after free\n", parent->name);
}
if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
if (ancestor->freed) {
printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
}
allocate_node(buffer, ancestor);
}
allocate_node(buffer, parent); allocate_node(buffer, parent);
} }
// allocate node // allocate node
allocate_node(buffer, node); allocate_node(buffer, node);
// update parents AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
if (is_view) { for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * ancestor = node; struct ggml_tensor * parent = node->src[j];
do { if (parent == NULL) {
ancestor = view_parent(ancestor); break;
} while (ggml_is_view(ancestor));
ancestor->n_views -= 1;
if (ancestor->n_views == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
} }
} else { AT_PRINTF("%s", parent->name);
for (int j = 0; j < GGML_MAX_SRC; j++) { if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
struct ggml_tensor * parent = node->src[j]; AT_PRINTF(", ");
if (parent == NULL) { }
break; }
} AT_PRINTF("\n");
// update parents
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
parent->n_children -= 1;
if (parent->n_children == 0 && parent->n_views == 0) {
if (ggml_is_view(parent)) { if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent; struct ggml_tensor * ancestor = parent;
do { do {
ancestor = view_parent(ancestor); ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor)); } while (ggml_is_view(ancestor));
ancestor->n_views -= 1; ancestor->n_views -= 1;
if (ancestor->n_views == 0) { if (ancestor->n_views == 0 && ancestor->n_children == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor); ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
} }
} }
else { else {
parent->n_children -= 1; ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
if (parent->n_children == 0) {
// free parent
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
}
} }
} }
} }
if (is_view) {
struct ggml_tensor * ancestor = node;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views -= 1;
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
}
}
AT_PRINTF("\n");
} }
} }
} }

1
ggml.c
View File

@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.node_id =*/ -1, /*.node_id =*/ -1,
/*.n_children =*/ 0, /*.n_children =*/ 0,
/*.n_views =*/ 0, /*.n_views =*/ 0,
/*.freed =*/ false,
/*.perf_runs =*/ 0, /*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0, /*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0, /*.perf_time_us =*/ 0,

3
ggml.h
View File

@ -425,6 +425,7 @@ extern "C" {
int node_id; // used to build graphs int node_id; // used to build graphs
int n_children; int n_children;
int n_views; int n_views;
bool freed; // debug
// performance // performance
int perf_runs; int perf_runs;
@ -437,7 +438,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu void * extra; // extra things e.g. for ggml-cuda.cu
char padding[12]; char padding[8];
}; };
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

View File

@ -703,7 +703,9 @@ static bool kv_cache_init(
const int64_t n_mem = n_layer*n_ctx; const int64_t n_mem = n_layer*n_ctx;
const int64_t n_elements = n_embd*n_mem; const int64_t n_elements = n_embd*n_mem;
size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB; size_t size = 2u*n_elements*ggml_type_size(wtype);
fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);
cache.buf = ggml_buffer_alloc(backend, size, 2); cache.buf = ggml_buffer_alloc(backend, size, 2);
cache.n = 0; cache.n = 0;