mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-03 17:51:09 +01:00
improved memory management fixes
This commit is contained in:
parent
56e9ae062c
commit
3d679827e7
@ -7,6 +7,9 @@
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
|
||||
//#define AT_PRINTF printf
|
||||
#define AT_PRINTF(...) ((void)0)
|
||||
|
||||
// allocator
|
||||
|
||||
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
||||
@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
|
||||
/////
|
||||
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
|
||||
allocator_ctx->size = MAX_SIZE_INIT;
|
||||
//allocator_ctx->data = 0;
|
||||
allocator_ctx->data = 0x1000;
|
||||
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
|
||||
//allocator_ctx->free_blocks[0].addr = 0;
|
||||
allocator_ctx->free_blocks[0].addr = 0x1000;
|
||||
}
|
||||
/////
|
||||
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, allocator_ctx->alignment);
|
||||
|
||||
// printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
|
||||
size_t max_avail = 0;
|
||||
|
||||
@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
|
||||
}
|
||||
}
|
||||
|
||||
// printf("block %d\n", best_fit_block);
|
||||
AT_PRINTF("block %d\n", best_fit_block);
|
||||
|
||||
if (best_fit_block == -1) {
|
||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||
@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
|
||||
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, allocator_ctx->alignment);
|
||||
//printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
|
||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
|
||||
tensor->freed = true;
|
||||
|
||||
// see if we can merge with an existing block
|
||||
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
|
||||
@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
node->n_children = 0;
|
||||
node->n_views = 0;
|
||||
//node->freed = false;
|
||||
}
|
||||
for (int i = 0; i < gf->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = gf->leafs[i];
|
||||
leaf->n_children = 0;
|
||||
leaf->n_views = 0;
|
||||
//leaf->freed = false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
|
||||
struct ggml_cgraph * gf = graphs[g];
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
if (ggml_is_view(node)) {
|
||||
struct ggml_tensor * ancestor = node;
|
||||
do {
|
||||
ancestor = view_parent(ancestor);
|
||||
} while (ggml_is_view(ancestor));
|
||||
ancestor->n_views += 1;
|
||||
}
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
@ -869,47 +882,74 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
if (parent->freed) {
|
||||
printf("!!!!!! tensor %s used after free\n", parent->name);
|
||||
}
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * ancestor = parent;
|
||||
do {
|
||||
ancestor = view_parent(ancestor);
|
||||
} while (ggml_is_view(ancestor));
|
||||
if (ancestor->freed) {
|
||||
printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
|
||||
}
|
||||
allocate_node(buffer, ancestor);
|
||||
}
|
||||
allocate_node(buffer, parent);
|
||||
}
|
||||
|
||||
// allocate node
|
||||
allocate_node(buffer, node);
|
||||
|
||||
// update parents
|
||||
if (is_view) {
|
||||
struct ggml_tensor * ancestor = node;
|
||||
do {
|
||||
ancestor = view_parent(ancestor);
|
||||
} while (ggml_is_view(ancestor));
|
||||
ancestor->n_views -= 1;
|
||||
if (ancestor->n_views == 0) {
|
||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
||||
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
AT_PRINTF("%s", parent->name);
|
||||
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||
AT_PRINTF(", ");
|
||||
}
|
||||
}
|
||||
AT_PRINTF("\n");
|
||||
|
||||
// update parents
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
parent->n_children -= 1;
|
||||
if (parent->n_children == 0 && parent->n_views == 0) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * ancestor = parent;
|
||||
do {
|
||||
ancestor = view_parent(ancestor);
|
||||
} while (ggml_is_view(ancestor));
|
||||
ancestor->n_views -= 1;
|
||||
if (ancestor->n_views == 0) {
|
||||
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
|
||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
||||
}
|
||||
}
|
||||
else {
|
||||
parent->n_children -= 1;
|
||||
if (parent->n_children == 0) {
|
||||
// free parent
|
||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
|
||||
}
|
||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_view) {
|
||||
struct ggml_tensor * ancestor = node;
|
||||
do {
|
||||
ancestor = view_parent(ancestor);
|
||||
} while (ggml_is_view(ancestor));
|
||||
ancestor->n_views -= 1;
|
||||
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
|
||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
||||
}
|
||||
}
|
||||
|
||||
AT_PRINTF("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
1
ggml.c
1
ggml.c
@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
||||
/*.node_id =*/ -1,
|
||||
/*.n_children =*/ 0,
|
||||
/*.n_views =*/ 0,
|
||||
/*.freed =*/ false,
|
||||
/*.perf_runs =*/ 0,
|
||||
/*.perf_cycles =*/ 0,
|
||||
/*.perf_time_us =*/ 0,
|
||||
|
3
ggml.h
3
ggml.h
@ -425,6 +425,7 @@ extern "C" {
|
||||
int node_id; // used to build graphs
|
||||
int n_children;
|
||||
int n_views;
|
||||
bool freed; // debug
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
@ -437,7 +438,7 @@ extern "C" {
|
||||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[12];
|
||||
char padding[8];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
@ -703,7 +703,9 @@ static bool kv_cache_init(
|
||||
const int64_t n_mem = n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
|
||||
size_t size = 2u*n_elements*ggml_type_size(wtype);
|
||||
|
||||
fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);
|
||||
|
||||
cache.buf = ggml_buffer_alloc(backend, size, 2);
|
||||
cache.n = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user