mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 11:23:56 +01:00
improved memory management fixes
This commit is contained in:
parent
56e9ae062c
commit
3d679827e7
100
ggml-backend.c
100
ggml-backend.c
@ -7,6 +7,9 @@
|
|||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
#define UNUSED(x) (void)(x)
|
||||||
|
|
||||||
|
//#define AT_PRINTF printf
|
||||||
|
#define AT_PRINTF(...) ((void)0)
|
||||||
|
|
||||||
// allocator
|
// allocator
|
||||||
|
|
||||||
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
||||||
@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
|
|||||||
/////
|
/////
|
||||||
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
|
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
|
||||||
allocator_ctx->size = MAX_SIZE_INIT;
|
allocator_ctx->size = MAX_SIZE_INIT;
|
||||||
//allocator_ctx->data = 0;
|
allocator_ctx->data = 0x1000;
|
||||||
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
|
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
|
||||||
//allocator_ctx->free_blocks[0].addr = 0;
|
allocator_ctx->free_blocks[0].addr = 0x1000;
|
||||||
}
|
}
|
||||||
/////
|
/////
|
||||||
|
|
||||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
||||||
size = aligned_offset(NULL, size, allocator_ctx->alignment);
|
size = aligned_offset(NULL, size, allocator_ctx->alignment);
|
||||||
|
|
||||||
// printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||||
|
|
||||||
size_t max_avail = 0;
|
size_t max_avail = 0;
|
||||||
|
|
||||||
@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// printf("block %d\n", best_fit_block);
|
AT_PRINTF("block %d\n", best_fit_block);
|
||||||
|
|
||||||
if (best_fit_block == -1) {
|
if (best_fit_block == -1) {
|
||||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||||
@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
|
|||||||
|
|
||||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
|
||||||
size = aligned_offset(NULL, size, allocator_ctx->alignment);
|
size = aligned_offset(NULL, size, allocator_ctx->alignment);
|
||||||
//printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
|
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
|
||||||
|
tensor->freed = true;
|
||||||
|
|
||||||
// see if we can merge with an existing block
|
// see if we can merge with an existing block
|
||||||
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
|
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
|
||||||
@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
|
|||||||
struct ggml_tensor * node = gf->nodes[i];
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
node->n_children = 0;
|
node->n_children = 0;
|
||||||
node->n_views = 0;
|
node->n_views = 0;
|
||||||
|
//node->freed = false;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < gf->n_leafs; i++) {
|
for (int i = 0; i < gf->n_leafs; i++) {
|
||||||
struct ggml_tensor * leaf = gf->leafs[i];
|
struct ggml_tensor * leaf = gf->leafs[i];
|
||||||
leaf->n_children = 0;
|
leaf->n_children = 0;
|
||||||
leaf->n_views = 0;
|
leaf->n_views = 0;
|
||||||
|
//leaf->freed = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
|
|||||||
struct ggml_cgraph * gf = graphs[g];
|
struct ggml_cgraph * gf = graphs[g];
|
||||||
for (int i = 0; i < gf->n_nodes; i++) {
|
for (int i = 0; i < gf->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = gf->nodes[i];
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
if (ggml_is_view(node)) {
|
||||||
|
struct ggml_tensor * ancestor = node;
|
||||||
|
do {
|
||||||
|
ancestor = view_parent(ancestor);
|
||||||
|
} while (ggml_is_view(ancestor));
|
||||||
|
ancestor->n_views += 1;
|
||||||
|
}
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
struct ggml_tensor * parent = node->src[j];
|
struct ggml_tensor * parent = node->src[j];
|
||||||
if (parent == NULL) {
|
if (parent == NULL) {
|
||||||
@ -869,48 +882,75 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
|
|||||||
if (parent == NULL) {
|
if (parent == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
allocate_node(buffer, parent);
|
if (parent->freed) {
|
||||||
}
|
printf("!!!!!! tensor %s used after free\n", parent->name);
|
||||||
|
|
||||||
// allocate node
|
|
||||||
allocate_node(buffer, node);
|
|
||||||
|
|
||||||
// update parents
|
|
||||||
if (is_view) {
|
|
||||||
struct ggml_tensor * ancestor = node;
|
|
||||||
do {
|
|
||||||
ancestor = view_parent(ancestor);
|
|
||||||
} while (ggml_is_view(ancestor));
|
|
||||||
ancestor->n_views -= 1;
|
|
||||||
if (ancestor->n_views == 0) {
|
|
||||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
||||||
struct ggml_tensor * parent = node->src[j];
|
|
||||||
if (parent == NULL) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
if (ggml_is_view(parent)) {
|
||||||
|
struct ggml_tensor * ancestor = parent;
|
||||||
|
do {
|
||||||
|
ancestor = view_parent(ancestor);
|
||||||
|
} while (ggml_is_view(ancestor));
|
||||||
|
if (ancestor->freed) {
|
||||||
|
printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
|
||||||
|
}
|
||||||
|
allocate_node(buffer, ancestor);
|
||||||
|
}
|
||||||
|
allocate_node(buffer, parent);
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate node
|
||||||
|
allocate_node(buffer, node);
|
||||||
|
|
||||||
|
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
AT_PRINTF("%s", parent->name);
|
||||||
|
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||||
|
AT_PRINTF(", ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
AT_PRINTF("\n");
|
||||||
|
|
||||||
|
// update parents
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
parent->n_children -= 1;
|
||||||
|
if (parent->n_children == 0 && parent->n_views == 0) {
|
||||||
if (ggml_is_view(parent)) {
|
if (ggml_is_view(parent)) {
|
||||||
struct ggml_tensor * ancestor = parent;
|
struct ggml_tensor * ancestor = parent;
|
||||||
do {
|
do {
|
||||||
ancestor = view_parent(ancestor);
|
ancestor = view_parent(ancestor);
|
||||||
} while (ggml_is_view(ancestor));
|
} while (ggml_is_view(ancestor));
|
||||||
ancestor->n_views -= 1;
|
ancestor->n_views -= 1;
|
||||||
if (ancestor->n_views == 0) {
|
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
|
||||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
parent->n_children -= 1;
|
|
||||||
if (parent->n_children == 0) {
|
|
||||||
// free parent
|
|
||||||
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
|
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (is_view) {
|
||||||
|
struct ggml_tensor * ancestor = node;
|
||||||
|
do {
|
||||||
|
ancestor = view_parent(ancestor);
|
||||||
|
} while (ggml_is_view(ancestor));
|
||||||
|
ancestor->n_views -= 1;
|
||||||
|
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
|
||||||
|
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AT_PRINTF("\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
1
ggml.c
1
ggml.c
@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|||||||
/*.node_id =*/ -1,
|
/*.node_id =*/ -1,
|
||||||
/*.n_children =*/ 0,
|
/*.n_children =*/ 0,
|
||||||
/*.n_views =*/ 0,
|
/*.n_views =*/ 0,
|
||||||
|
/*.freed =*/ false,
|
||||||
/*.perf_runs =*/ 0,
|
/*.perf_runs =*/ 0,
|
||||||
/*.perf_cycles =*/ 0,
|
/*.perf_cycles =*/ 0,
|
||||||
/*.perf_time_us =*/ 0,
|
/*.perf_time_us =*/ 0,
|
||||||
|
3
ggml.h
3
ggml.h
@ -425,6 +425,7 @@ extern "C" {
|
|||||||
int node_id; // used to build graphs
|
int node_id; // used to build graphs
|
||||||
int n_children;
|
int n_children;
|
||||||
int n_views;
|
int n_views;
|
||||||
|
bool freed; // debug
|
||||||
|
|
||||||
// performance
|
// performance
|
||||||
int perf_runs;
|
int perf_runs;
|
||||||
@ -437,7 +438,7 @@ extern "C" {
|
|||||||
|
|
||||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||||
|
|
||||||
char padding[12];
|
char padding[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||||
|
@ -703,7 +703,9 @@ static bool kv_cache_init(
|
|||||||
const int64_t n_mem = n_layer*n_ctx;
|
const int64_t n_mem = n_layer*n_ctx;
|
||||||
const int64_t n_elements = n_embd*n_mem;
|
const int64_t n_elements = n_embd*n_mem;
|
||||||
|
|
||||||
size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
|
size_t size = 2u*n_elements*ggml_type_size(wtype);
|
||||||
|
|
||||||
|
fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);
|
||||||
|
|
||||||
cache.buf = ggml_buffer_alloc(backend, size, 2);
|
cache.buf = ggml_buffer_alloc(backend, size, 2);
|
||||||
cache.n = 0;
|
cache.n = 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user