mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-04 01:57:53 +01:00
llama : update session save/load
This commit is contained in:
parent
e262947d43
commit
66aaac9867
91
llama.cpp
91
llama.cpp
@ -1563,6 +1563,8 @@ static bool llama_kv_cache_init(
|
|||||||
|
|
||||||
const int i_gpu_start = n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
|
const int i_gpu_start = n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
|
||||||
|
|
||||||
|
GGML_UNUSED(offload);
|
||||||
|
|
||||||
for (int i = 0; i < (int) n_layer; i++) {
|
for (int i = 0; i < (int) n_layer; i++) {
|
||||||
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
|
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
|
||||||
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
|
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
|
||||||
@ -5406,7 +5408,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
//
|
//
|
||||||
// TODO: will be removed with backend v2
|
// TODO: will be removed with backend v2
|
||||||
|
|
||||||
#define LLAMA_OFFLOAD_DEBUG
|
//#define LLAMA_OFFLOAD_DEBUG
|
||||||
|
|
||||||
if (!do_offload) {
|
if (!do_offload) {
|
||||||
return;
|
return;
|
||||||
@ -9297,40 +9299,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|||||||
data_ctx->write(&kv_used, sizeof(kv_used));
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
||||||
|
|
||||||
if (kv_buf_size) {
|
if (kv_buf_size) {
|
||||||
#pragma message("TODO: implement KV cache saving")
|
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
||||||
#if 0
|
|
||||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
|
||||||
|
|
||||||
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
||||||
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
||||||
|
|
||||||
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
|
||||||
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
|
std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
|
||||||
kout3d->data = kout3d_data.data();
|
|
||||||
|
|
||||||
ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
|
for (int il = 0; il < (int) n_layer; ++il) {
|
||||||
std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
|
ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
||||||
vout3d->data = vout3d_data.data();
|
kout2d_data[il].resize(ggml_nbytes(kout2d));
|
||||||
|
kout2d->data = kout2d_data[il].data();
|
||||||
|
|
||||||
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
||||||
n_embd, kv_head, n_layer,
|
vout2d_data[il].resize(ggml_nbytes(vout2d));
|
||||||
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
vout2d->data = vout2d_data[il].data();
|
||||||
|
|
||||||
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
||||||
kv_head, n_embd, n_layer,
|
n_embd, kv_head,
|
||||||
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
elt_size*n_embd, 0);
|
||||||
|
|
||||||
|
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
|
||||||
|
kv_head, n_embd,
|
||||||
|
elt_size*n_ctx, 0);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
|
||||||
|
}
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
|
||||||
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
|
|
||||||
// our data is now in the kout3d_data and vout3d_data buffers
|
// our data is now in the kout2d_data and vout2d_data buffers
|
||||||
// write them to file
|
// write them to file
|
||||||
data_ctx->write(kout3d_data.data(), kout3d_data.size());
|
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||||
data_ctx->write(vout3d_data.data(), vout3d_data.size());
|
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
|
||||||
#endif
|
data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t i = 0; i < kv_size; ++i) {
|
for (uint32_t i = 0; i < kv_size; ++i) {
|
||||||
@ -9430,35 +9437,35 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|||||||
if (kv_buf_size) {
|
if (kv_buf_size) {
|
||||||
GGML_ASSERT(kv_self.buf.size == kv_buf_size);
|
GGML_ASSERT(kv_self.buf.size == kv_buf_size);
|
||||||
|
|
||||||
#pragma message("TODO: implement KV cache loading")
|
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
||||||
#if 0
|
|
||||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
|
||||||
|
|
||||||
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
||||||
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
||||||
|
|
||||||
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
kin3d->data = (void *) inp;
|
ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
||||||
inp += ggml_nbytes(kin3d);
|
kin2d->data = (void *) inp;
|
||||||
|
inp += ggml_nbytes(kin2d);
|
||||||
|
|
||||||
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
|
ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
||||||
vin3d->data = (void *) inp;
|
vin2d->data = (void *) inp;
|
||||||
inp += ggml_nbytes(vin3d);
|
inp += ggml_nbytes(vin2d);
|
||||||
|
|
||||||
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
||||||
n_embd, kv_head, n_layer,
|
n_embd, kv_head,
|
||||||
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
elt_size*n_embd, 0);
|
||||||
|
|
||||||
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
|
||||||
kv_head, n_embd, n_layer,
|
kv_head, n_embd,
|
||||||
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
elt_size*n_ctx, 0);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
|
||||||
|
}
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
|
||||||
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->kv_self.head = kv_head;
|
ctx->kv_self.head = kv_head;
|
||||||
|
2
llama.h
2
llama.h
@ -42,7 +42,7 @@
|
|||||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||||
|
|
||||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||||
#define LLAMA_SESSION_VERSION 2
|
#define LLAMA_SESSION_VERSION 3
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
|
Loading…
Reference in New Issue
Block a user