llama : optimize defrag moves + fix fragmentation calculation (#6037)

* attempt to reduce the impact of a worst-case scenario

* fragmentation calculation fix

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 3ca23481dd
commit 2c4fb69246

llama.cpp | 30 +++++++++++++++++++-----------
1 file changed, 19 insertions(+), 11 deletions(-)
--- a/llama.cpp
+++ b/llama.cpp
@@ -9036,8 +9036,8 @@ static int llama_decode_internal(
     //llama_synchronize(&lctx);
 
     // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
 
         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
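For context: this check runs at the end of llama_decode_internal, after the batch has already been allocated into the cache, so kv_self.used already accounts for the new tokens. The old formula added n_tokens_all on top of that and thus understated fragmentation. A minimal standalone sketch of the corrected metric (the helper itself is hypothetical; the field names follow the hunk above):

// hypothetical helper mirroring the corrected calculation above
// n    - size of the active KV window (kv_self.n)
// used - number of occupied cells in that window (kv_self.used)
static float kv_fragmentation(uint32_t n, uint32_t used) {
    if (n < 128) {
        return 0.0f; // small windows are never considered fragmented
    }
    return 1.0f - float(used)/float(n);
}

With defrag_thold = 0.1f, a window of n = 1024 cells with used = 900 gives fragmentation of roughly 0.12, so a defrag is queued.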
@@ -9069,6 +9069,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;
 
+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
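The new comment spells out the budget: each cell move contributes a source view, a destination view, and a copy node to the defrag graph, once for the keys and once for the values, i.e. 6*n_layer graph nodes per move. A worked example, assuming LLAMA_MAX_NODES is 8192 (its value in the tree at the time; check llama.cpp before relying on it):

#include <cstdint>
#include <cstdio>

#define LLAMA_MAX_NODES_ASSUMED 8192 // assumed; mirrors LLAMA_MAX_NODES in llama.cpp

int main() {
    const uint32_t n_layer = 32; // e.g. a 7B LLaMA-style model

    // 6 tensors per move: (src view + dst view + cpy) x (K and V)
    const uint32_t max_moves = LLAMA_MAX_NODES_ASSUMED/(6*n_layer);

    printf("max moves per defrag graph: %u\n", max_moves); // prints 42
    return 0;
}

Hoisting the constant out of the loop also makes the cap independent of the size of any individual hole, which is what enables the per-move check further down.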
@@ -9095,15 +9100,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }
 
-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;
 
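This hunk deletes the old per-hole check, which is where the worst case lived: if a single hole was large enough that 6*(n_moves + nh)*n_layer crossed LLAMA_MAX_NODES, defrag bailed out before moving anything at all. The replacement (next hunk) budgets individual moves instead. A small self-contained comparison of the two policies, using the same assumed constants as above:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t max_nodes = 8192; // assumed LLAMA_MAX_NODES
    const uint32_t n_layer   = 32;
    const uint32_t max_moves = max_nodes/(6*n_layer); // 42

    const uint32_t nh      = 100; // one large hole right at the start
    const uint32_t n_moves = 0;   // nothing moved yet

    // old policy: reject the whole hole up front
    const uint32_t old_moved = (6*(n_moves + nh)*n_layer >= max_nodes) ? 0 : nh;

    // new policy: move cells one by one until the budget runs out
    const uint32_t new_moved = nh < max_moves ? nh : max_moves;

    printf("old policy: %u cells moved\n", old_moved); // 0
    printf("new policy: %u cells moved\n", new_moved); // 42
    return 0;
}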
@@ -9133,11 +9129,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
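The stop flag is the standard idiom for unwinding from an inner loop to an outer one, since break only leaves one level in C++. A minimal illustration with the same shape as the hunks above (outer loop over holes, inner loop over candidate cells; the loop bounds are made up):

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t max_moves = 42;
    uint32_t n_moves = 0;

    bool stop = false;
    for (uint32_t hole = 0; hole < 1000; ++hole) {
        for (uint32_t cand = 0; cand < 1000; ++cand) {
            if (n_moves == max_moves) {
                stop = true; // remember the decision for the outer loop
                break;       // leave the inner loop immediately
            }
            n_moves++;
        }
        if (stop) {
            break; // propagate: leave the outer loop too
        }
    }

    printf("performed %u moves\n", n_moves); // 42
    return 0;
}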
@@ -9164,6 +9168,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }
 
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
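Once queued, the moves themselves are executed by the next llama_kv_cache_update. From the application side the only knob involved here is defrag_thold in llama_context_params; a minimal usage sketch against the public API of that era (model path and threshold value are placeholders):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = 0.1f; // queue a defrag once >10% of the KV window is holes

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... llama_decode(ctx, batch) calls now trigger defrag automatically ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}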