diff --git a/llama.cpp b/llama.cpp
index d561f80f6..5433bde86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2034,7 +2034,7 @@ struct llama_rs_cache {
     uint32_t n = 0; // range of states used for the last slot
 
     // useful to know the minimum reserved cell count per seq_id
-    // only counts sequences with n_cells > 0 AND which have a non-shared tail
+    // only counts sequences which have a non-shared tail
     uint32_t n_seqs = 0;
     // cells part of multiple sequences AND which have at least one tail
     uint32_t n_shared_tail_cells = 0;
@@ -2082,21 +2082,37 @@ struct llama_rs_cache {
             llama_rs_cell & cell = cells[cell_id];
             if (cell.seq_nodes.empty()) {
                 if (cell.pos >= 0) {
+                    if (debug) {
+                        LLAMA_LOG_ERROR("%s: cells[%d].pos is %d while it's empty (should be -1)\n",
+                            __func__, cell_id, cell.pos);
+                    }
                     cell.pos = -1;
                     was_valid = false;
                 }
             }
             if (cell.pos < 0) {
                 if (cell.pos != -1) {
+                    if (debug) {
+                        LLAMA_LOG_ERROR("%s: cells[%d].pos is %d while it's empty (should be -1)\n",
+                            __func__, cell_id, cell.pos);
+                    }
                     cell.pos = -1;
                     was_valid = false;
                 }
                 if (!cell.seq_nodes.empty()) {
+                    if (debug) {
+                        LLAMA_LOG_ERROR("%s: cells[%d] has %zu seq_ids while it's empty (should have none)\n",
+                            __func__, cell_id, cell.seq_nodes.size());
+                    }
                     cell.seq_nodes.clear();
                     was_valid = false;
                 }
                 cell.src = -1;
                 if (cell.prev != -1) {
+                    if (debug) {
+                        LLAMA_LOG_ERROR("%s: cells[%d].prev is %d while it's empty (should be -1)\n",
+                            __func__, cell_id, cell.prev);
+                    }
                     cell.prev = -1;
                     was_valid = false;
                 }
@@ -2213,17 +2229,15 @@ struct llama_rs_cache {
         // n_seqs
         uint32_t n_seqs_verif = 0;
         uint32_t n_shared_tail_cells_verif = 0;
-        for (llama_seq_id seq_id = 0; (uint32_t) seq_id < size; ++seq_id) {
-            auto & seq = seq_tails[seq_id];
-            if (seq.tail >= 0) {
-                llama_rs_cell & tail_cell = cells[seq.tail];
-                // NOTE: could also have checked if n_cells > 0
-                if (!tail_cell.seq_nodes.empty() && tail_cell.seq_nodes[0].seq_id == seq_id) {
-                    if (tail_cell.seq_nodes.size() > 1) {
-                        n_shared_tail_cells_verif += 1;
-                    } else {
+        for (uint32_t cell_id = 0; (uint32_t) cell_id < size; ++cell_id) {
+            llama_rs_cell & rs_cell = cells[cell_id];
+            if (!rs_cell.seq_nodes.empty()) {
+                if (rs_cell.seq_nodes.size() == 1) {
+                    if (rs_cell.tail_rc == 1) {
                         n_seqs_verif += 1;
                     }
+                } else if (rs_cell.tail_rc > 0) {
+                    n_shared_tail_cells_verif += 1;
                 }
             }
         }
@@ -2246,72 +2260,15 @@ struct llama_rs_cache {
         return was_valid;
     }
 
-    // returns whether or not a cell was freed
-    void clear_cell(llama_rs_cell & rs_cell) {
-        GGML_ASSERT(&rs_cell >= cells.data() && &rs_cell < cells.data() + cells.size());
-        if (!rs_cell.is_empty()) {
-            // update sequence tree links
-            bool first = true;
-            for (const llama_rs_seq_node & node : rs_cell.seq_nodes) {
-                if (node.next_cell >= 0 && (uint32_t) node.next_cell < size) {
-                    // NOTE: if all next cells are the same cell, this should still work
-                    cells[node.next_cell].prev = rs_cell.prev;
-                }
-                // next_cell of the nodes of the previous cell
-                if (rs_cell.prev >= 0 && (uint32_t) rs_cell.prev < size) {
-                    llama_rs_cell & prev_cell = cells[rs_cell.prev];
-                    auto prev_node = std::find(prev_cell.seq_nodes.begin(), prev_cell.seq_nodes.end(), node);
-                    // assuming the previous node is always found
-                    GGML_ASSERT(prev_node != prev_cell.seq_nodes.end());
-                    prev_node->next_cell = node.next_cell;
-                    if (node.is_tail()) {
-                        prev_cell.tail_rc += 1;
-                    }
-                }
-                if ((uint32_t) node.seq_id < seq_tails.size()) {
-                    auto & seq = seq_tails[node.seq_id];
-                    // update tail
-                    if (node.is_tail()) {
-                        seq.tail = rs_cell.prev;
-                    }
-                    // cell counts
-                    if (first) {
-                        seq.n_cells -= 1;
-                        if (rs_cell.tail_rc > 0 && seq.tail < 0) {
-                            // last tail cell
-                            if (rs_cell.seq_nodes.size() > 1) {
-                                n_shared_tail_cells -= 1;
-                            } else {
-                                n_seqs -= 1;
-                            }
-                        }
-                        first = false;
-                    }
-                } else {
-                    GGML_ASSERT(false && "invalid seq_id");
-                }
-            }
-            rs_cell.pos  = -1;
-            rs_cell.src  = -1;
-            rs_cell.prev = -1;
-            rs_cell.tail_rc = 0;
-            rs_cell.seq_nodes.clear();
-            used -= 1;
-        }
-    }
-
     // returns an iterator to the seq_node after the removed one, or the same which was passed if it wasn't removed.
     std::vector<llama_rs_seq_node>::iterator remove_seq_node_from_cell(llama_rs_cell & rs_cell, std::vector<llama_rs_seq_node>::iterator node_iter) {
         GGML_ASSERT(&rs_cell >= cells.data() && &rs_cell < cells.data() + cells.size());
         // TODO: assert the iterator points inside the correct vector
         if (node_iter != rs_cell.seq_nodes.end()) {
-            if (rs_cell.seq_nodes.size() == 1) {
-                clear_cell(rs_cell);
-                return rs_cell.seq_nodes.end();
-            }
-            // else update tree
+            // update the tree
             llama_rs_seq_node node = *node_iter;
             if (node.next_cell >= 0 && (uint32_t) node.next_cell < size) {
+                // NOTE: because of this, partially removing seq_ids from cells should only be done from the tail
                 cells[node.next_cell].prev = rs_cell.prev;
             }
             if (rs_cell.prev >= 0 && (uint32_t) rs_cell.prev < size) {
@@ -2321,6 +2278,14 @@ struct llama_rs_cache {
                 GGML_ASSERT(prev_node != prev_cell.seq_nodes.end());
                 prev_node->next_cell = node.next_cell;
                 if (node.is_tail()) {
+                    if (prev_cell.seq_nodes.size() > 1) {
+                        if (prev_cell.tail_rc == 0) {
+                            n_shared_tail_cells += 1;
+                        }
+                        if (rs_cell.seq_nodes.size() == 1) {
+                            n_seqs -= 1;
+                        }
+                    }
                     prev_cell.tail_rc += 1;
                 }
             }
@@ -2328,11 +2293,15 @@ struct llama_rs_cache {
                 auto & seq = seq_tails[node.seq_id];
                 if (node.is_tail()) {
                     seq.tail = rs_cell.prev;
-                    if (seq.tail < 0 && rs_cell.tail_rc == 1) {
-                        // assuming the previous cell of a shared cell is also shared,
-                        // (no need to update the shared tail cells count elsewhere, then)
-                        // this was a shared tail cell, but will no longer be a tail cell
-                        n_shared_tail_cells -= 1;
+                    if (rs_cell.tail_rc == 1) {
+                        if (rs_cell.seq_nodes.size() > 1) {
+                            // assuming the previous cell of a shared cell is also shared,
+                            // this was a shared tail cell, but will no longer be a tail cell
+                            n_shared_tail_cells -= 1;
+                        } else if (seq.tail < 0) {
+                            // no more tail, no more sequence
+                            n_seqs -= 1;
+                        }
                     }
                     GGML_ASSERT(rs_cell.tail_rc > 0);
                     rs_cell.tail_rc -= 1;
@@ -2341,21 +2310,30 @@ struct llama_rs_cache {
                     // this seq_id was the first in the list
                     seq.n_cells -= 1;
 
-                    // the next node is the new first one, so update its n_cells
-                    // (will never be out-of-bounds because the size is > 1)
-                    llama_rs_seq_node next_node = *(std::next(node_iter));
-                    if ((uint32_t) next_node.seq_id < seq_tails.size()) {
-                        auto & next_seq = seq_tails[next_node.seq_id];
-                        next_seq.n_cells += 1;
-                        // only the tail ref count from the other seq_ids are left in tail_rc
-                        if (rs_cell.tail_rc > 0) {
-                            // will become a non-shared cell
-                            if (rs_cell.seq_nodes.size() == 2) {
-                                n_seqs += 1;
+                    auto next_node = std::next(node_iter);
+                    if (next_node != rs_cell.seq_nodes.end()) {
+                        // the next node is the new first one, so update its n_cells
+                        if ((uint32_t) next_node->seq_id < seq_tails.size()) {
+                            auto & next_seq = seq_tails[next_node->seq_id];
+                            next_seq.n_cells += 1;
+                            // only the tail ref count from the other seq_ids are left in tail_rc
+                            if (rs_cell.tail_rc > 0) {
+                                // will become a non-shared cell
+                                if (rs_cell.seq_nodes.size() == 2) {
+                                    n_shared_tail_cells -= 1;
+                                    n_seqs += 1;
+                                }
                             }
+                        } else {
+                            GGML_ASSERT(false && "invalid seq_id");
                         }
                     } else {
-                        GGML_ASSERT(false && "invalid seq_id");
+                        // this was the last seq_id of the cell
+                        used -= 1;
+                        rs_cell.pos = -1;
+                        rs_cell.src = -1;
+                        rs_cell.prev = -1;
+                        // the other fields *should* have already been updated elsewhere
                     }
                 }
             } else {
@@ -2366,6 +2344,13 @@ struct llama_rs_cache {
         return node_iter;
     }
 
+    void clear_cell(llama_rs_cell & rs_cell) {
+        GGML_ASSERT(&rs_cell >= cells.data() && &rs_cell < cells.data() + cells.size());
+        for (auto node_iter = rs_cell.seq_nodes.begin(); node_iter != rs_cell.seq_nodes.end();) {
+            node_iter = remove_seq_node_from_cell(rs_cell, node_iter);
+        }
+    }
+
     // returns whether or not the seq_id was removed
     bool remove_seq_from_cell_id(uint32_t i_cell, const llama_seq_id & id) {
         if (i_cell < size && (size_t) id < size) {
@@ -2404,47 +2389,63 @@ struct llama_rs_cache {
                 prev_cell.tail_rc -= 1;
                 prev_node->next_cell = i_cell;
                 rs_cell.prev = prev;
+                if (seq.tail == prev) {
+                    // What to do when the tail moves...
+                    // from unique to shared (n_seqs--)
+                    //   if the new cell has one seq_id or has no tails (n_shared_tail_cells++)
+                    //   if the new cell has one seq_id and a tail (n_seqs-- (yes, another time))
+                    // from unique to unique (seq.n_cells++)
+                    // from empty  to unique (seq.n_cells++, n_seqs++)
+                    // from empty  to shared
+                    //   if the new cell only has one seq_id or has no tail (n_shared_tail_cells++)
+                    //   if the new cell only has one seq_id and has one tail (n_seqs--)
+                    // from shared to shared
+                    //   if the last cell has no tails (n_shared_tail_cells--)
+                    //   if the new cell has no tails or has one seq_id (n_shared_tail_cells++)
+                    //   if the new cell only has one seq_id and has one tail (n_seqs--)
+                    // from shared to unique (seq.n_cells++)
+                    //   if this seq_id was not the first of the last cell (n_seqs++)
+                    //   if the last cell has no tails (n_shared_tail_cells--)
+                    if (prev_cell.seq_nodes.size() > 1) {
+                        // from shared
+                        if (rs_cell.is_empty()) {
+                            // to unique
+                            if (prev_cell.seq_nodes[0].seq_id != id) {
+                                n_seqs += 1;
+                            }
+                        }
+                        // the previous cell is no longer a shared tail
+                        if (prev_cell.tail_rc == 0) {
+                            n_shared_tail_cells -= 1;
+                        }
+                    } else if (!rs_cell.is_empty()) {
+                        // from unique to shared
+                        n_seqs -= 1;
+                    }
+                }
             }
             if (rs_cell.is_empty()) {
-                // either the sequence didn't own any cells or had a shared tail cell
-                if (seq.n_cells == 0 || (seq.tail >= 0 && cells[seq.tail].seq_nodes.size() > 1)) {
-                    n_seqs += 1;
-                }
+                // to unique
                 seq.n_cells += 1;
-                // set pos if still unset
-                if (rs_cell.pos < 0) {
+                if (seq.tail < 0) {
+                    // from empty to unique
+                    n_seqs += 1;
+                    // pos was not yet set
                     rs_cell.pos = 0;
                     rs_cell.src = -1;
                 }
                 used += 1;
-            } else if (rs_cell.seq_nodes.size() == 1 && rs_cell.tail_rc == 1) {
-                // don't count shared-cell tails
-                // FIXME: make this saner
-                n_seqs -= 1;
-                n_shared_tail_cells += 1;
-            } else if (rs_cell.tail_rc == 0) {
-                // shared cell without a tail gets a tail;
-                // FIXME: don't prune, in case this is used in llama_cache_seq_cp
-                GGML_ASSERT(false); // make sure we don't get here by accident
-                // prune the other sequences out of this cell
-                // NOTE: have to inline the removal because the state tree is partially invalid
-                bool first = true;
-                for (auto & node : rs_cell.seq_nodes) {
-                    GGML_ASSERT(node.seq_id != id);
-                    GGML_ASSERT(node.next_cell >= 0);
-                    // easy removal, none of the nodes are tails
-                    llama_rs_cell & next_cell = cells[node.next_cell];
-                    next_cell.prev = rs_cell.prev;
-                    if (first) {
-                        auto & first_seq = seq_tails[node.seq_id];
-                        first_seq.n_cells -= 1;
-                        first = false;
+            } else {
+                // to shared
+                if (rs_cell.seq_nodes.size() == 1) {
+                    // a lone tail becomes a shared cell
+                    if (rs_cell.tail_rc > 0) {
+                        n_seqs -= 1;
                     }
+                    n_shared_tail_cells += 1;
+                } else if (rs_cell.tail_rc == 0) {
+                    n_shared_tail_cells += 1;
                 }
-                rs_cell.seq_nodes.clear();
-            } else if (rs_cell.seq_nodes.size() != rs_cell.tail_rc) {
-                // this is correct as long as this isn't called when trying to find a slot
-                // TODO: find a way to assert this
             }
             // the target cell was not already a tail of this seq_id
             rs_cell.insert_node(id); // next_cell == -1 by default
@@ -2977,6 +2978,7 @@ static bool llama_kv_cache_find_slot(
                             llama_rs_cell & candidate = cache.rs.cells[cell_id];
                             if (candidate.is_empty()) { break; }
                             if (candidate.tail_rc == 1 && seq.tail == (int32_t) cell_id) {
+                                // the candidate is the old tail
                                 if (candidate.seq_nodes.size() > 1) {
                                     // prune out the other seq_ids, because they diverge
                                     // TODO(maybe): hande this in insert_seq_tail_to_cell_id
@@ -3198,40 +3200,42 @@ static llama_pos llama_cache_seq_rm(
         llama_pos new_p0 = 0;
         llama_pos new_p1 = std::numeric_limits<llama_pos>::max();
 
-        for (uint32_t i = 0; i < cache.rs.size; ++i) {
-            llama_rs_cell & rs_cell = cache.rs.cells[i];
-            auto seq_node = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id);
+        // partial seq_id removal has to happen from the tail
+        llama_rs_seq_meta & seq = cache.rs.seq_tails[seq_id];
+        int32_t cell_id = seq.tail;
+
+        while (cell_id >= 0) {
+            llama_rs_cell & rs_cell = cache.rs.cells[cell_id];
+            // copy before the cell is potentially changed
+            int32_t prev_id = rs_cell.prev;
+            if (rs_cell.pos >= p1 && rs_cell.seq_nodes.size() > 1) {
+                // non-tail removal for shared cells can only be done when clearing a cell
+                // (i.e. when the next cell's link to the previous cell can be safely changed)
+                p1 = rs_cell.pos + 1;
+            }
+            if (rs_cell.pos >= p0 && rs_cell.pos < p1) {
+                auto node_iter = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id);
+                // if the node isn't found, the sequence tree is malformed
+                GGML_ASSERT(node_iter != rs_cell.seq_nodes.end());
+                cache.rs.remove_seq_node_from_cell(rs_cell, node_iter);
+                // get the smallest removed cell id
+                if (new_head > (uint32_t) cell_id) { new_head = cell_id; }
+            } else {
+                // one more than the biggest non-removed cell of this sequence
+                if (rs_cell.pos >= n_past) { n_past = rs_cell.pos + 1; }
 
-            if (seq_id < 0 || seq_node != rs_cell.seq_nodes.end()) {
                 if (rs_cell.pos < p0) {
-                    // move forward the new p0 further
-                    if (rs_cell.pos >= new_p0) {
-                        new_p0 = rs_cell.pos + 1;
-                    }
-                } else if (rs_cell.pos >= p1) {
-                    // move back the new p1 further
-                    if (rs_cell.pos < new_p1) {
-                        new_p1 = rs_cell.pos;
-                    }
-                    if (rs_cell.pos >= n_past) {
-                        n_past = rs_cell.pos + 1;
-                    }
-                } else { // (rs_cell.pos >= p0 && rs_cell.pos < p1)
-                    if (seq_id < 0) {
-                        cache.rs.clear_cell(rs_cell);
-                    } else { // (rs_cell.has_seq_id(seq_id))
-                        cache.rs.remove_seq_node_from_cell(rs_cell, seq_node);
-                    }
-                    if (rs_cell.is_empty() && new_head == cache.rs.size) {
-                        new_head = i;
-                    }
+                    // new_p0 should be right after the max pos in the states before p0
+                    if (rs_cell.pos >= new_p0) { new_p0 = rs_cell.pos + 1; }
+                } else { // (rs_cell.pos >= p1)
+                    // new_p1 should be the min pos in the states after p1
+                    if (rs_cell.pos < new_p1)  { new_p1 = rs_cell.pos; }
                 }
             }
+            cell_id = prev_id;
         }
         p0 = new_p0;
         p1 = new_p1;
-        // correctly set n_past when there's nothing after p1
-        if (n_past < p0) { n_past = p0; }
 
         // If we freed up a slot, set head to it so searching can start there.
         if (new_head != cache.rs.size && new_head < cache.rs.head) {
@@ -3259,10 +3263,8 @@ static llama_pos llama_cache_seq_rm(
                         kv_cell.pos = -1;
                         if (new_head == cache.kv.size) { new_head = i; }
                     }
-                } else {
-                    if (kv_cell.pos >= n_past) {
-                        n_past = kv_cell.pos + 1;
-                    }
+                } else if (kv_cell.pos >= n_past) {
+                    n_past = kv_cell.pos + 1;
                 }
             }
         }
@@ -3292,42 +3294,37 @@ static llama_pos llama_cache_seq_cp(
     llama_pos n_past = 0;
 
     if (cache.rs.size > 0) {
-        // have to start from beginning for recurrent models
+        // have to start from the beginning for recurrent models
         p0 = 0;
         if ((uint32_t) seq_id_dst < cache.rs.size && (uint32_t) seq_id_src < cache.rs.size) {
-            auto seq_src = cache.rs.seq_tails[seq_id_src];
-            int32_t src_tail = seq_src.tail;
-            // find the last tail of src in the pos range
-            while (src_tail >= 0 && (uint32_t) src_tail < cache.rs.size) {
-                llama_rs_cell & tail_cell = cache.rs.cells[src_tail];
-                if (tail_cell.pos < p1) {
-                    break;
-                }
-                src_tail = tail_cell.prev;
-            }
-
-            uint32_t new_head = cache.rs.size;
-
+            int32_t src_head = -1;
+            int32_t head_pos = p1;
+            int32_t src_next = -1;
+            // find the start of the sequence
             for (uint32_t i = 0; i < cache.rs.size; ++i) {
                 llama_rs_cell & rs_cell = cache.rs.cells[i];
-                if (rs_cell.pos >= p0 && rs_cell.pos < p1 && rs_cell.has_seq_id(seq_id_src)) {
-                    if (i == (uint32_t) src_tail) {
-                        // need to be inserted in order, but there's only one
-                        cache.rs.insert_seq_tail_to_cell_id(i, seq_id_dst);
-                    } else {
-                        // keep only the tail cell of the source
-                        // assuming a copy means no rollback will be attempted afterwards
-                        cache.rs.remove_seq_from_cell_id(i, seq_id_src);
-                        if (new_head == cache.rs.size) {
-                            new_head = i;
-                        }
+                if (!rs_cell.is_empty() && rs_cell.prev < 0) {
+                    auto seq_node = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id_src);
+                    if (seq_node != rs_cell.seq_nodes.end()) {
+                        src_head = i;
+                        head_pos = rs_cell.pos;
+                        src_next = seq_node->next_cell;
+                        break;
                     }
                 }
             }
-
-            // If we freed up a slot, set head to it so searching can start there.
-            if (new_head != cache.rs.size && new_head < cache.rs.head) {
-                cache.rs.head = new_head;
+            while (src_head >= 0 && head_pos < p1) {
+                cache.rs.insert_seq_tail_to_cell_id(src_head, seq_id_dst);
+                src_head = src_next;
+                if (head_pos >= n_past) { n_past = head_pos + 1; }
+                if (src_next >= 0) {
+                    llama_rs_cell & rs_cell = cache.rs.cells[src_next];
+                    auto seq_node = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), seq_id_src);
+                    head_pos = rs_cell.pos;
+                    // it should always be found if the seq tree is valid
+                    GGML_ASSERT(seq_node != rs_cell.seq_nodes.end());
+                    src_next = seq_node->next_cell;
+                }
             }
         }
         p1 = n_past;
@@ -3338,9 +3335,7 @@ static llama_pos llama_cache_seq_cp(
             llama_kv_cell & kv_cell = cache.kv.cells[i];
             if (kv_cell.pos >= p0 && kv_cell.pos < p1 && kv_cell.has_seq_id(seq_id_src)) {
                 kv_cell.seq_id.insert(seq_id_dst);
-                if (kv_cell.pos >= n_past) {
-                    n_past = kv_cell.pos + 1;
-                }
+                if (kv_cell.pos >= n_past) { n_past = kv_cell.pos + 1; }
             }
         }
     }
@@ -3352,18 +3347,19 @@ static void llama_cache_seq_keep(struct llama_cache & cache, llama_seq_id seq_id
     if (cache.rs.size > 0) {
         uint32_t new_head = cache.rs.size;
 
-        for (uint32_t i = 0; i < cache.rs.size; ++i) {
-            llama_rs_cell & rs_cell = cache.rs.cells[i];
-            if (!rs_cell.seq_nodes.empty()) {
-                for (auto node_iter = rs_cell.seq_nodes.begin(); node_iter != rs_cell.seq_nodes.end();) {
-                    if (node_iter->seq_id != seq_id) {
-                        node_iter = cache.rs.remove_seq_node_from_cell(rs_cell, node_iter);
-                    } else {
-                        node_iter = std::next(node_iter);
-                    }
-                }
-                if (new_head == cache.rs.size && rs_cell.is_empty()) {
-                    new_head = i;
+        // partial seq_id removal has to happen from the tail(s)
+        for (uint32_t i = 0; i < cache.rs.seq_tails.size(); ++i) {
+            if (i == (uint32_t) seq_id) { continue; }
+            llama_rs_seq_meta & seq = cache.rs.seq_tails[i];
+            int32_t cell_id = seq.tail;
+            while (cell_id >= 0) {
+                llama_rs_cell & rs_cell = cache.rs.cells[cell_id];
+                auto node_iter = std::find(rs_cell.seq_nodes.begin(), rs_cell.seq_nodes.end(), i);
+                GGML_ASSERT(node_iter != rs_cell.seq_nodes.end());
+                cache.rs.remove_seq_node_from_cell(rs_cell, node_iter);
+                cell_id = rs_cell.prev;
+                if (new_head > (uint32_t) cell_id && rs_cell.is_empty()) {
+                    new_head = cell_id;
                 }
             }
         }
@@ -3414,6 +3410,7 @@ static llama_pos llama_cache_seq_add(
         auto & seq = cache.rs.seq_tails[seq_id];
         // follow the sequence from its tail
         int32_t cell_id = seq.tail;
+        uint32_t new_head = cache.rs.size;
         while (cell_id >= 0) {
             GGML_ASSERT((uint32_t) cell_id < cache.rs.size);
             llama_rs_cell & rs_cell = cache.rs.cells[cell_id];
@@ -3423,13 +3420,19 @@ static llama_pos llama_cache_seq_add(
                 if (rs_cell.pos < 0) {
                     // NOTE: this affects the other sequences which share the cell
                     cache.rs.clear_cell(rs_cell);
-                    // TODO: update cache.rs.head
+                    if (new_head > (uint32_t) cell_id) {
+                        new_head = cell_id;
+                    }
                 }
             }
             if (n_past <= rs_cell.pos) {
                 n_past = rs_cell.pos + 1;
             }
         }
+
+        // If we freed up a slot, set head to it so searching can start there.
+        // Otherwise we just start the next search from the beginning.
+        cache.rs.head = new_head != cache.rs.size ? new_head : 0;
     }
 
     if (cache.kv.size > 0) {
@@ -3474,8 +3477,8 @@ static llama_pos llama_cache_seq_div(
                  llama_pos   p0,
                  llama_pos   p1,
                        int   d) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    if (p0 < 0) { p0 = 0; }
+    if (p1 < 0) { p1 = std::numeric_limits<llama_pos>::max(); }
 
     llama_pos n_past = p0;
 
@@ -11275,6 +11278,10 @@ static int llama_decode_internal(
             }
         }
         n_outputs_prev += lctx.n_outputs;
+
+#ifndef NDEBUG
+        GGML_ASSERT(lctx.cache.rs.rebuild(true));
+#endif
     }
 
     // wait for the computation to finish (automatically done when obtaining the model output)
@@ -16332,11 +16339,19 @@ void llama_batch_free(struct llama_batch batch) {
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
+#ifndef NDEBUG
+    GGML_ASSERT(ctx->cache.rs.rebuild(true));
+#endif
+
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
+#ifndef NDEBUG
+    GGML_ASSERT(ctx->cache.rs.rebuild(true));
+#endif
+
     return ret;
 }