llama : correctly handle more edge cases for the rs cache

Francis Couture-Harpin 2024-04-09 17:35:22 -04:00
parent 0028010d01
commit 0c8b3b2095

llama.cpp (407 lines changed)

@@ -2034,7 +2034,7 @@ struct llama_rs_cache {
uint32_t n = 0; // range of states used for the last slot
// useful to know the minimum reserved cell count per seq_id
// only counts sequences with n_cells > 0 AND which have a non-shared tail
// only counts sequences which have a non-shared tail
uint32_t n_seqs = 0;
// cells part of multiple sequences AND which have at least one tail
uint32_t n_shared_tail_cells = 0;
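For context on these two counters, the verification loop in the @@ -2213 hunk below recomputes them from scratch. A minimal self-contained sketch of the invariant it checks, with MiniCell as a hypothetical stand-in for llama_rs_cell:

#include <cstdint>
#include <vector>

struct MiniCell {
    uint32_t n_seq_ids = 0; // number of sequences using this cell
    uint32_t tail_rc   = 0; // how many of those sequences have their tail here
};

// recompute both counters the way rebuild() verifies them
static void recount(const std::vector<MiniCell> & cells,
                    uint32_t & n_seqs, uint32_t & n_shared_tail_cells) {
    n_seqs = 0;
    n_shared_tail_cells = 0;
    for (const MiniCell & cell : cells) {
        if (cell.n_seq_ids == 1) {
            // a lone sequence counts only when this cell is its tail
            if (cell.tail_rc == 1) { n_seqs += 1; }
        } else if (cell.n_seq_ids > 1 && cell.tail_rc > 0) {
            // a cell shared by several sequences, with at least one tail
            n_shared_tail_cells += 1;
        }
    }
}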
@@ -2082,21 +2082,37 @@ struct llama_rs_cache {
llama_rs_cell & cell = cells[cell_id];
if (cell.seq_nodes.empty()) {
if (cell.pos >= 0) {
if (debug) {
LLAMA_LOG_ERROR("%s: cells[%d].pos is %d while it's empty (should be -1)\n",
__func__, cell_id, cell.pos);
}
cell.pos = -1;
was_valid = false;
}
}
if (cell.pos < 0) {
if (cell.pos != -1) {
if (debug) {
LLAMA_LOG_ERROR("%s: cells[%d].pos is %d while it's empty (should be -1)\n",
__func__, cell_id, cell.pos);
}
cell.pos = -1;
was_valid = false;
}
if (!cell.seq_nodes.empty()) {
if (debug) {
LLAMA_LOG_ERROR("%s: cells[%d] has %zu seq_ids while it's empty (should have none)\n",
__func__, cell_id, cell.seq_nodes.size());
}
cell.seq_nodes.clear();
was_valid = false;
}
cell.src = -1;
if (cell.prev != -1) {
if (debug) {
LLAMA_LOG_ERROR("%s: cells[%d].prev is %d while it's empty (should be -1)\n",
__func__, cell_id, cell.prev);
}
cell.prev = -1;
was_valid = false;
}
@@ -2213,17 +2229,15 @@ struct llama_rs_cache {
// n_seqs
uint32_t n_seqs_verif = 0;
uint32_t n_shared_tail_cells_verif = 0;
for (llama_seq_id seq_id = 0; (uint32_t) seq_id < size; ++seq_id) {
auto & seq = seq_tails[seq_id];
if (seq.tail >= 0) {
llama_rs_cell & tail_cell = cells[seq.tail];
// NOTE: could also have checked if n_cells > 0
if (!tail_cell.seq_nodes.empty() && tail_cell.seq_nodes[0].seq_id == seq_id) {
if (tail_cell.seq_nodes.size() > 1) {
n_shared_tail_cells_verif += 1;
} else {
for (uint32_t cell_id = 0; (uint32_t) cell_id < size; ++cell_id) {
llama_rs_cell & rs_cell = cells[cell_id];
if (!rs_cell.seq_nodes.empty()) {
if (rs_cell.seq_nodes.size() == 1) {
if (rs_cell.tail_rc == 1) {
n_seqs_verif += 1;
}
} else if (rs_cell.tail_rc > 0) {
n_shared_tail_cells_verif += 1;
}
}
}
@@ -2246,72 +2260,15 @@ struct llama_rs_cache {
return was_valid;
}
// clears the cell, freeing it if it wasn't already empty
void clear_cell(llama_rs_cell & rs_cell) {
GGML_ASSERT(&rs_cell >= cells.data() && &rs_cell < cells.data() + cells.size());
if (!rs_cell.is_empty()) {
// update sequence tree links
bool first = true;
for (const llama_rs_seq_node & node : rs_cell.seq_nodes) {
if (node.next_cell >= 0 && (uint32_t) node.next_cell < size) {
// NOTE: if all next cells are the same cell, this should still work
cells[node.next_cell].prev = rs_cell.prev;
}
// update the next_cell of the matching node of the previous cell
if (rs_cell.prev >= 0 && (uint32_t) rs_cell.prev < size) {
llama_rs_cell & prev_cell = cells[rs_cell.prev];
auto prev_node = std::find(prev_cell.seq_nodes.begin(), prev_cell.seq_nodes.end(), node);
// assuming the previous node is always found
GGML_ASSERT(prev_node != prev_cell.seq_nodes.end());
prev_node->next_cell = node.next_cell;
if (node.is_tail()) {
prev_cell.tail_rc += 1;
}
}
if ((uint32_t) node.seq_id < seq_tails.size()) {
auto & seq = seq_tails[node.seq_id];
// update tail
if (node.is_tail()) {
seq.tail = rs_cell.prev;
}
// cell counts
if (first) {
seq.n_cells -= 1;
if (rs_cell.tail_rc > 0 && seq.tail < 0) {
// last tail cell
if (rs_cell.seq_nodes.size() > 1) {
n_shared_tail_cells -= 1;
} else {
n_seqs -= 1;
}
}
first = false;
}
} else {
GGML_ASSERT(false && "invalid seq_id");
}
}
rs_cell.pos = -1;
rs_cell.src = -1;
rs_cell.prev = -1;
rs_cell.tail_rc = 0;
rs_cell.seq_nodes.clear();
used -= 1;
}
}
// returns an iterator to the seq_node after the removed one, or the same iterator that was passed if nothing was removed.
std::vector<llama_rs_seq_node>::iterator remove_seq_node_from_cell(llama_rs_cell & rs_cell, std::vector<llama_rs_seq_node>::iterator node_iter) {
GGML_ASSERT(&rs_cell >= cells.data() && &rs_cell < cells.data() + cells.size());
// TODO: assert the iterator points inside the correct vector
if (node_iter != rs_cell.seq_nodes.end()) {
if (rs_cell.seq_nodes.size() == 1) {
clear_cell(rs_cell);
return rs_cell.seq_nodes.end();
}
// else update tree
// update the tree
llama_rs_seq_node node = *node_iter;
if (node.next_cell >= 0 && (uint32_t) node.next_cell < size) {
// NOTE: because of this, partially removing seq_ids from cells should only be done from the tail
cells[node.next_cell].prev = rs_cell.prev;
}
if (rs_cell.prev >= 0 && (uint32_t) rs_cell.prev < size) {
@@ -2321,6 +2278,14 @@ struct llama_rs_cache {
GGML_ASSERT(prev_node != prev_cell.seq_nodes.end());
prev_node->next_cell = node.next_cell;
if (node.is_tail()) {
if (prev_cell.seq_nodes.size() > 1) {
if (prev_cell.tail_rc == 0) {
n_shared_tail_cells += 1;
}
if (rs_cell.seq_nodes.size() == 1) {
n_seqs -= 1;
}
}
prev_cell.tail_rc += 1;
}
}
@@ -2328,11 +2293,15 @@ struct llama_rs_cache {
auto & seq = seq_tails[node.seq_id];
if (node.is_tail()) {
seq.tail = rs_cell.prev;
if (seq.tail < 0 && rs_cell.tail_rc == 1) {
// assuming the previous cell of a shared cell is also shared,
// (no need to update the shared tail cells count elsewhere, then)
// this was a shared tail cell, but will no longer be a tail cell
n_shared_tail_cells -= 1;
if (rs_cell.tail_rc == 1) {
if (rs_cell.seq_nodes.size() > 1) {
// assuming the previous cell of a shared cell is also shared,
// this was a shared tail cell, but will no longer be a tail cell
n_shared_tail_cells -= 1;
} else if (seq.tail < 0) {
// no more tail, no more sequence
n_seqs -= 1;
}
}
GGML_ASSERT(rs_cell.tail_rc > 0);
rs_cell.tail_rc -= 1;
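To make the branch above concrete, a small worked trace (hypothetical cell contents), checked against the counter invariant from the rebuild() hunk:

// Worked example (hypothetical): cell C holds seq_nodes {2, 3}, tail_rc == 1,
// and only seq 3 still ends at C.
// Removing seq 3's tail node from C:
//   tail_rc == 1 and C has more than one seq_node
//     => C stops being a tail cell entirely: n_shared_tail_cells -= 1
//        (C keeps seq 2's node, but no sequence ends there anymore)
// If instead C held only {3} and seq 3 has no previous cell (seq.tail < 0):
//     => the sequence no longer has any tail cell: n_seqs -= 1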
@@ -2341,21 +2310,30 @@ struct llama_rs_cache {
// this seq_id was the first in the list
seq.n_cells -= 1;
// the next node is the new first one, so update its n_cells
// (will never be out-of-bounds because the size is > 1)
llama_rs_seq_node next_node = *(std::next(node_iter));
if ((uint32_t) next_node.seq_id < seq_tails.size()) {
auto & next_seq = seq_tails[next_node.seq_id];
next_seq.n_cells += 1;
// only the tail ref counts from the other seq_ids are left in tail_rc
if (rs_cell.tail_rc > 0) {
// will become a non-shared cell
if (rs_cell.seq_nodes.size() == 2) {
n_seqs += 1;
auto next_node = std::next(node_iter);
if (next_node != rs_cell.seq_nodes.end()) {
// the next node is the new first one, so update its n_cells
if ((uint32_t) next_node->seq_id < seq_tails.size()) {
auto & next_seq = seq_tails[next_node->seq_id];
next_seq.n_cells += 1;
// only the tail ref counts from the other seq_ids are left in tail_rc
if (rs_cell.tail_rc > 0) {
// will become a non-shared cell
if (rs_cell.seq_nodes.size() == 2) {
n_shared_tail_cells -= 1;
n_seqs += 1;
}
}
} else {
GGML_ASSERT(false && "invalid seq_id");
}
} else {
GGML_ASSERT(false && "invalid seq_id");
// this was the last seq_id of the cell
used -= 1;
rs_cell.pos = -1;
rs_cell.src = -1;
rs_cell.prev = -1;
// the other fields *should* have already been updated elsewhere
}
}
} else {
@@ -2366,6 +2344,13 @@ struct llama_rs_cache {
return node_iter;
}
void clear_cell(llama_rs_cell & rs_cell) {
GGML_ASSERT(&rs_cell >= cells.data() && &rs_cell < cells.data() + cells.size());
for (auto node_iter = rs_cell.seq_nodes.begin(); node_iter != rs_cell.seq_nodes.end();) {
node_iter = remove_seq_node_from_cell(rs_cell, node_iter);
}
}
// returns whether or not the seq_id was removed
bool remove_seq_from_cell_id(uint32_t i_cell, const llama_seq_id & id) {
if (i_cell < size && (size_t) id < size) {
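The new clear_cell() above leans on remove_seq_node_from_cell() returning the iterator that follows the removed node, the same contract as std::vector::erase. A minimal sketch of that erase-style loop (hypothetical names, not the commit's code):

#include <vector>

static std::vector<int>::iterator remove_node(std::vector<int> & nodes,
                                              std::vector<int>::iterator it) {
    // bookkeeping for the removed node would happen here
    return nodes.erase(it); // erase() returns the iterator after the removed one
}

static void clear_all(std::vector<int> & nodes) {
    for (auto it = nodes.begin(); it != nodes.end();) {
        // no ++it: the removal already advanced the iterator,
        // and incrementing an invalidated iterator would be undefined behavior
        it = remove_node(nodes, it);
    }
}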
@@ -2404,47 +2389,63 @@ struct llama_rs_cache {
prev_cell.tail_rc -= 1;
prev_node->next_cell = i_cell;
rs_cell.prev = prev;
if (seq.tail == prev) {
// What to do when the tail moves...
// from unique to shared (n_seqs--)
// if the new cell has one seq_id or has no tails (n_shared_tail_cells++)
// if the new cell has one seq_id and a tail (n_seqs-- (yes, another time))
// from unique to unique (seq.n_cells++)
// from empty to unique (seq.n_cells++, n_seqs++)
// from empty to shared
// if the new cell only has one seq_id or has no tail (n_shared_tail_cells++)
// if the new cell only has one seq_id and has one tail (n_seqs--)
// from shared to shared
// if the last cell has no tails (n_shared_tail_cells--)
// if the new cell has no tails or has one seq_id (n_shared_tail_cells++)
// if the new cell only has one seq_id and has one tail (n_seqs--)
// from shared to unique (seq.n_cells++)
// if this seq_id was not the first of the last cell (n_seqs++)
// if the last cell has no tails (n_shared_tail_cells--)
if (prev_cell.seq_nodes.size() > 1) {
// from shared
if (rs_cell.is_empty()) {
// to unique
if (prev_cell.seq_nodes[0].seq_id != id) {
n_seqs += 1;
}
}
// the previous cell is no longer a shared tail
if (prev_cell.tail_rc == 0) {
n_shared_tail_cells -= 1;
}
} else if (!rs_cell.is_empty()) {
// from unique to shared
n_seqs -= 1;
}
}
}
if (rs_cell.is_empty()) {
// either the sequence didn't own any cells or had a shared tail cell
if (seq.n_cells == 0 || (seq.tail >= 0 && cells[seq.tail].seq_nodes.size() > 1)) {
n_seqs += 1;
}
// to unique
seq.n_cells += 1;
// set pos if still unset
if (rs_cell.pos < 0) {
if (seq.tail < 0) {
// from empty to unique
n_seqs += 1;
// pos was not yet set
rs_cell.pos = 0;
rs_cell.src = -1;
}
used += 1;
} else if (rs_cell.seq_nodes.size() == 1 && rs_cell.tail_rc == 1) {
// don't count shared-cell tails
// FIXME: make this saner
n_seqs -= 1;
n_shared_tail_cells += 1;
} else if (rs_cell.tail_rc == 0) {
// shared cell without a tail gets a tail;
// FIXME: don't prune, in case this is used in llama_cache_seq_cp
GGML_ASSERT(false); // make sure we don't get here by accident
// prune the other sequences out of this cell
// NOTE: have to inline the removal because the state tree is partially invalid
bool first = true;
for (auto & node : rs_cell.seq_nodes) {
GGML_ASSERT(node.seq_id != id);
GGML_ASSERT(node.next_cell >= 0);
// easy removal, none of the nodes are tails
llama_rs_cell & next_cell = cells[node.next_cell];
next_cell.prev = rs_cell.prev;
if (first) {
auto & first_seq = seq_tails[node.seq_id];
first_seq.n_cells -= 1;
first = false;
} else {
// to shared
if (rs_cell.seq_nodes.size() == 1) {
// a lone tail becomes a shared cell
if (rs_cell.tail_rc > 0) {
n_seqs -= 1;
}
n_shared_tail_cells += 1;
} else if (rs_cell.tail_rc == 0) {
n_shared_tail_cells += 1;
}
rs_cell.seq_nodes.clear();
} else if (rs_cell.seq_nodes.size() != rs_cell.tail_rc) {
// this is correct as long as this isn't called when trying to find a slot
// TODO: find a way to assert this
}
// the target cell was not already a tail of this seq_id
rs_cell.insert_node(id); // next_cell == -1 by default
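A small worked example (hypothetical layout) for the tail-move transitions handled above, checked against the counter invariant from the rebuild() hunk:

// Before: seqs 0 and 1 both end at cell A, so A.seq_nodes == {0, 1}, tail_rc == 2
//   => n_seqs == 0, n_shared_tail_cells == 1
// insert_seq_tail_to_cell_id(B, 1) with B empty:
//   A keeps {0, 1} but its tail_rc drops to 1: still a shared tail cell (seq 0)
//   seq 1 moves "from shared to unique" and B becomes its lone tail
// After:  => n_seqs == 1 (seq 1 at B), n_shared_tail_cells == 1 (A, via seq 0)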
@@ -2977,6 +2978,7 @@ static bool llama_kv_cache_find_slot(
llama_rs_cell & candidate = cache.rs.cells[cell_id];
if (candidate.is_empty()) { break; }
if (candidate.tail_rc == 1 && seq.tail == (int32_t) cell_id) {
// the candidate is the old tail
if (candidate.seq_nodes.size() > 1) {
// prune out the other seq_ids, because they diverge
// TODO(maybe): handle this in insert_seq_tail_to_cell_id
@@ -3198,40 +3200,42 @@ static llama_pos llama_cache_seq_rm(
llama_pos new_p0 = 0;
llama_pos new_p1 = std::numeric_limits<llama_pos>::max();
for (uint32_t i = 0; i < cache.rs.size; ++i) {
llama_rs_cell & rs_cell = cache.rs.cells[i];
auto seq_node = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id);
// partial seq_id removal has to happen from the tail
llama_rs_seq_meta & seq = cache.rs.seq_tails[seq_id];
int32_t cell_id = seq.tail;
while (cell_id >= 0) {
llama_rs_cell & rs_cell = cache.rs.cells[cell_id];
// copy before the cell is potentially changed
int32_t prev_id = rs_cell.prev;
if (rs_cell.pos >= p1 && rs_cell.seq_nodes.size() > 1) {
// non-tail removal for shared cells can only be done when clearing a cell
// (i.e. when the next cell's link to the previous cell can be safely changed)
p1 = rs_cell.pos + 1;
}
if (rs_cell.pos >= p0 && rs_cell.pos < p1) {
auto node_iter = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id);
// if the node isn't found, the sequence tree is malformed
GGML_ASSERT(node_iter != rs_cell.seq_nodes.end());
cache.rs.remove_seq_node_from_cell(rs_cell, node_iter);
// get the smallest removed cell id
if (new_head > (uint32_t) cell_id) { new_head = cell_id; }
} else {
// one more than the biggest pos of the non-removed cells of this sequence
if (rs_cell.pos >= n_past) { n_past = rs_cell.pos + 1; }
if (seq_id < 0 || seq_node != rs_cell.seq_nodes.end()) {
if (rs_cell.pos < p0) {
// move forward the new p0 further
if (rs_cell.pos >= new_p0) {
new_p0 = rs_cell.pos + 1;
}
} else if (rs_cell.pos >= p1) {
// move back the new p1 further
if (rs_cell.pos < new_p1) {
new_p1 = rs_cell.pos;
}
if (rs_cell.pos >= n_past) {
n_past = rs_cell.pos + 1;
}
} else { // (rs_cell.pos >= p0 && rs_cell.pos < p1)
if (seq_id < 0) {
cache.rs.clear_cell(rs_cell);
} else { // (rs_cell.has_seq_id(seq_id))
cache.rs.remove_seq_node_from_cell(rs_cell, seq_node);
}
if (rs_cell.is_empty() && new_head == cache.rs.size) {
new_head = i;
}
// new_p0 should be right after the max pos in the states before p0
if (rs_cell.pos >= new_p0) { new_p0 = rs_cell.pos + 1; }
} else { // (rs_cell.pos >= p1)
// new_p1 should be the min pos in the states after p1
if (rs_cell.pos < new_p1) { new_p1 = rs_cell.pos; }
}
}
cell_id = prev_id;
}
p0 = new_p0;
p1 = new_p1;
// correctly set n_past when there's nothing after p1
if (n_past < p0) { n_past = p0; }
// If we freed up a slot, set head to it so searching can start there.
if (new_head != cache.rs.size && new_head < cache.rs.head) {
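The rewrite above replaces the full cache scan with a walk along the sequence itself, from its tail through the prev links. A minimal sketch of that traversal (MiniCell is a hypothetical stand-in), including the prev copy that must happen before the cell is mutated:

#include <cstdint>
#include <vector>

struct MiniCell {
    int32_t prev = -1; // previous cell of the sequence, or -1 at its start
};

static void walk_from_tail(std::vector<MiniCell> & cells, int32_t tail) {
    for (int32_t cell_id = tail; cell_id >= 0;) {
        // copy before the cell is potentially changed:
        // clearing a cell resets its prev link to -1
        const int32_t prev_id = cells[cell_id].prev;
        // ... remove or keep the sequence's node in cells[cell_id] here ...
        cell_id = prev_id;
    }
}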
@@ -3259,10 +3263,8 @@ static llama_pos llama_cache_seq_rm(
kv_cell.pos = -1;
if (new_head == cache.kv.size) { new_head = i; }
}
} else {
if (kv_cell.pos >= n_past) {
n_past = kv_cell.pos + 1;
}
} else if (kv_cell.pos >= n_past) {
n_past = kv_cell.pos + 1;
}
}
}
@@ -3292,42 +3294,37 @@ static llama_pos llama_cache_seq_cp(
llama_pos n_past = 0;
if (cache.rs.size > 0) {
// have to start from beginning for recurrent models
// have to start from the beginning for recurrent models
p0 = 0;
if ((uint32_t) seq_id_dst < cache.rs.size && (uint32_t) seq_id_src < cache.rs.size) {
auto seq_src = cache.rs.seq_tails[seq_id_src];
int32_t src_tail = seq_src.tail;
// find the last tail of src in the pos range
while (src_tail >= 0 && (uint32_t) src_tail < cache.rs.size) {
llama_rs_cell & tail_cell = cache.rs.cells[src_tail];
if (tail_cell.pos < p1) {
break;
}
src_tail = tail_cell.prev;
}
uint32_t new_head = cache.rs.size;
int32_t src_head = -1;
int32_t head_pos = p1;
int32_t src_next = -1;
// find the start of the sequence
for (uint32_t i = 0; i < cache.rs.size; ++i) {
llama_rs_cell & rs_cell = cache.rs.cells[i];
if (rs_cell.pos >= p0 && rs_cell.pos < p1 && rs_cell.has_seq_id(seq_id_src)) {
if (i == (uint32_t) src_tail) {
// need to be inserted in order, but there's only one
cache.rs.insert_seq_tail_to_cell_id(i, seq_id_dst);
} else {
// keep only the tail cell of the source
// assuming a copy means no rollback will be attempted afterwards
cache.rs.remove_seq_from_cell_id(i, seq_id_src);
if (new_head == cache.rs.size) {
new_head = i;
}
if (!rs_cell.is_empty() && rs_cell.prev < 0) {
auto seq_node = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id_src);
if (seq_node != rs_cell.seq_nodes.end()) {
src_head = i;
head_pos = rs_cell.pos;
src_next = seq_node->next_cell;
break;
}
}
}
// If we freed up a slot, set head to it so searching can start there.
if (new_head != cache.rs.size && new_head < cache.rs.head) {
cache.rs.head = new_head;
while (src_head >= 0 && head_pos < p1) {
cache.rs.insert_seq_tail_to_cell_id(src_head, seq_id_dst);
src_head = src_next;
if (head_pos >= n_past) { n_past = head_pos + 1; }
if (src_next >= 0) {
llama_rs_cell & rs_cell = cache.rs.cells[src_next];
auto seq_node = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id_src);
head_pos = rs_cell.pos;
// it should always be found if the seq tree is valid
GGML_ASSERT(seq_node != rs_cell.seq_nodes.end());
src_next = seq_node->next_cell;
}
}
}
p1 = n_past;
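The new copy path first locates the head of the source sequence (the cell with prev < 0), then follows next_cell links forward, so the destination's tail is inserted oldest-to-newest. A minimal sketch (hypothetical types, not the commit's code):

#include <cstdint>
#include <functional>
#include <vector>

struct MiniCell {
    int32_t next_cell = -1; // the source sequence's next cell, or -1 at its tail
};

static void copy_forward(const std::vector<MiniCell> & cells, int32_t head,
                         const std::function<void(int32_t)> & insert_dst_tail) {
    for (int32_t cell_id = head; cell_id >= 0;) {
        // inserting in pos order keeps the destination's tree links valid
        insert_dst_tail(cell_id);
        cell_id = cells[cell_id].next_cell;
    }
}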
@@ -3338,9 +3335,7 @@ static llama_pos llama_cache_seq_cp(
llama_kv_cell & kv_cell = cache.kv.cells[i];
if (kv_cell.pos >= p0 && kv_cell.pos < p1 && kv_cell.has_seq_id(seq_id_src)) {
kv_cell.seq_id.insert(seq_id_dst);
if (kv_cell.pos >= n_past) {
n_past = kv_cell.pos + 1;
}
if (kv_cell.pos >= n_past) { n_past = kv_cell.pos + 1; }
}
}
}
@@ -3352,18 +3347,19 @@ static void llama_cache_seq_keep(struct llama_cache & cache, llama_seq_id seq_id)
if (cache.rs.size > 0) {
uint32_t new_head = cache.rs.size;
for (uint32_t i = 0; i < cache.rs.size; ++i) {
llama_rs_cell & rs_cell = cache.rs.cells[i];
if (!rs_cell.seq_nodes.empty()) {
for (auto node_iter = rs_cell.seq_nodes.begin(); node_iter != rs_cell.seq_nodes.end();) {
if (node_iter->seq_id != seq_id) {
node_iter = cache.rs.remove_seq_node_from_cell(rs_cell, node_iter);
} else {
node_iter = std::next(node_iter);
}
}
if (new_head == cache.rs.size && rs_cell.is_empty()) {
new_head = i;
// partial seq_id removal has to happen from the tail(s)
for (uint32_t i = 0; i < cache.rs.seq_tails.size(); ++i) {
if (i == (uint32_t) seq_id) { continue; }
llama_rs_seq_meta & seq = cache.rs.seq_tails[i];
int32_t cell_id = seq.tail;
while (cell_id >= 0) {
llama_rs_cell & rs_cell = cache.rs.cells[cell_id];
// copy before the cell is potentially changed
int32_t prev_id = rs_cell.prev;
auto node_iter = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), i);
GGML_ASSERT(node_iter != rs_cell.seq_nodes.end());
cache.rs.remove_seq_node_from_cell(rs_cell, node_iter);
if (new_head > (uint32_t) cell_id && rs_cell.is_empty()) {
new_head = cell_id;
}
cell_id = prev_id;
}
}
@@ -3414,6 +3410,7 @@ static llama_pos llama_cache_seq_add(
auto & seq = cache.rs.seq_tails[seq_id];
// follow the sequence from its tail
int32_t cell_id = seq.tail;
uint32_t new_head = cache.rs.size;
while (cell_id >= 0) {
GGML_ASSERT((uint32_t) cell_id < cache.rs.size);
llama_rs_cell & rs_cell = cache.rs.cells[cell_id];
@@ -3423,13 +3420,19 @@ static llama_pos llama_cache_seq_add(
if (rs_cell.pos < 0) {
// NOTE: this affects the other sequences which share the cell
cache.rs.clear_cell(rs_cell);
// TODO: update cache.rs.head
if (new_head > (uint32_t) cell_id) {
new_head = cell_id;
}
}
}
if (n_past <= rs_cell.pos) {
n_past = rs_cell.pos + 1;
}
}
// If we freed up a slot, set head to it so searching can start there.
// Otherwise we just start the next search from the beginning.
cache.rs.head = new_head != cache.rs.size ? new_head : 0;
}
if (cache.kv.size > 0) {
@@ -3474,8 +3477,8 @@ static llama_pos llama_cache_seq_div(
llama_pos p0,
llama_pos p1,
int d) {
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
if (p0 < 0) { p0 = 0; }
if (p1 < 0) { p1 = std::numeric_limits<llama_pos>::max(); }
llama_pos n_past = p0;
@@ -11275,6 +11278,10 @@ static int llama_decode_internal(
}
}
n_outputs_prev += lctx.n_outputs;
#ifndef NDEBUG
GGML_ASSERT(lctx.cache.rs.rebuild(true));
#endif
}
// wait for the computation to finish (automatically done when obtaining the model output)
@@ -16332,11 +16339,19 @@ void llama_batch_free(struct llama_batch batch) {
int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
#ifndef NDEBUG
GGML_ASSERT(ctx->cache.rs.rebuild(true));
#endif
const int ret = llama_decode_internal(*ctx, batch);
if (ret < 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
}
#ifndef NDEBUG
GGML_ASSERT(ctx->cache.rs.rebuild(true));
#endif
return ret;
}
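These NDEBUG-guarded asserts rely on rebuild() being a check-and-repair pass: it returns whether the incrementally maintained metadata was already consistent, logging and fixing any drift. A minimal sketch of that pattern (names hypothetical, not the actual rebuild()):

#include <cassert>
#include <cstdio>

struct MiniCache {
    int n_seqs = 0;

    int recount_seqs() const { return 0; /* a full recount would go here */ }

    // returns whether the cached counter was already valid;
    // repairs it either way so later checks start from a clean state
    bool rebuild(bool debug) {
        bool was_valid = true;
        const int expected = recount_seqs();
        if (n_seqs != expected) {
            if (debug) {
                fprintf(stderr, "n_seqs is %d while it should be %d\n", n_seqs, expected);
            }
            n_seqs = expected;
            was_valid = false;
        }
        return was_valid;
    }
};

int main() {
    MiniCache cache;
#ifndef NDEBUG
    assert(cache.rebuild(true)); // debug builds fail fast on counter drift
#endif
    return 0;
}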