diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9eae6fe57..910e2243d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,8 +8,6 @@ #include #include -// llama input - static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -34,56 +32,88 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -void llama_context::set_k_shift(llama_kv_cache & kv) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); - - int32_t * data = (int32_t *) inp_K_shift->data; - - for (uint32_t i = 0; i < kv.size; ++i) { - data[i] = kv.cells[i].delta; - } +// TODO: improve +void llama_context::reset() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; } -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { +void llama_context::prepare_k_shift() { +} + +void llama_context::prepare_defrag() { +} + +void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { +} + +// llama input + +void llama_context::set_inputs(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + // // set input data // - const auto & hparams = lctx.model.hparams; - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; + if (inp_K_shift) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; + } + + // the K-shift graph requires just this input + return; + } if (ubatch.token) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); } if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); } - if (ubatch.pos && lctx.inp_pos) { + if (ubatch.pos && inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - auto n_pos = lctx.n_pos_per_token; - ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos)); + auto n_pos = n_pos_per_token; + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - if (!lctx.inp_out_ids) { - LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__); + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); } else { const int64_t n_tokens = ubatch.n_tokens; - 
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); - int32_t * data = (int32_t *) lctx.inp_out_ids->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; - if (lctx.n_outputs == n_tokens) { + if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } @@ -95,26 +125,26 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(lctx.n_outputs == n_outputs); - } else if (lctx.n_outputs == 1) { + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { // only keep last output data[0] = n_tokens - 1; } else { - GGML_ASSERT(lctx.n_outputs == 0); + GGML_ASSERT(n_outputs == 0); } } } GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); - if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) { + if (inp_KQ_mask || inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. - if (cparams.causal_attn && !lctx.is_encoding) { + if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -123,14 +153,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (lctx.inp_KQ_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - data = (float *) lctx.inp_KQ_mask->data; + if (inp_KQ_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); + data = (float *) inp_KQ_mask->data; } - if (lctx.inp_KQ_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer)); - data_swa = (float *) lctx.inp_KQ_mask_swa->data; + if (inp_KQ_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_swa->buffer)); + data_swa = (float *) inp_KQ_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -191,11 +221,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = hparams.causal_attn && !is_encoding ? 
kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) inp_KQ_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -238,11 +268,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - float * data = (float *) lctx.inp_mean->data; - memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); std::vector sum(n_tokens, 0); @@ -279,11 +309,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; @@ -306,11 +336,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); @@ -341,17 +371,18 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; - if (lctx.inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); - float * data = (float *) lctx.inp_s_mask->data; + if (inp_s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); + float * data = (float *) inp_s_mask->data; // clear unused states for (int i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); + // TODO: do not mutate the KV cache // only clear once if (kv_cell.src < 0) { kv_cell.src = cell_id; @@ -359,14 +390,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - int32_t * data = (int32_t *) lctx.inp_s_copy->data; + if (inp_s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); + int32_t * data = (int32_t *) inp_s_copy->data; // assuming copy destinations ALWAYS 
happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; // prevent out-of-bound sources if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { @@ -375,6 +406,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { data[i] = kv_cell.src; + // TODO: do not mutate the KV cache // ensure copy only happens once if (kv_cell.src != (int32_t) cell_id) { kv_cell.src = cell_id; @@ -383,20 +415,20 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_pos_bucket) { + if (inp_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - int32_t * data = (int32_t *) lctx.inp_pos_bucket->data; + int32_t * data = (int32_t *) inp_pos_bucket->data; - if (!lctx.is_encoding) { + if (!is_encoding) { const int64_t n_kv = kv_self.n; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } @@ -404,28 +436,28 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } } } - if (!lctx.is_encoding && lctx.inp_embd_enc) { - assert(lctx.inp_embd_enc->type == GGML_TYPE_F32); - assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size()); + if (!is_encoding && inp_embd_enc) { + assert(inp_embd_enc->type == GGML_TYPE_F32); + assert((size_t) ggml_nelements(inp_embd_enc) == embd_enc.size()); - ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc)); + ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); } - if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) { - const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd; + if (!is_encoding && inp_KQ_mask_cross) { + const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_cross->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - float * data = (float *) lctx.inp_KQ_mask_cross->data; + float * data = (float *) inp_KQ_mask_cross->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -433,7 +465,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float f = -INFINITY; for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch.seq_id[j][s]; - if 
(lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) { + if (seq_ids_enc[i].find(seq_id) != seq_ids_enc[i].end()) { f = 0.0f; } } @@ -450,6 +482,851 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } +// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +// do mat_mul_id, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur, + ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_KQ_mask = causal + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask, "KQ_mask", -1); + ggml_set_input(inp_KQ_mask); + + inp_KQ_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask, GGML_TYPE_F16) : inp_KQ_mask; + + if (swa) { + GGML_ASSERT(hparams.n_swa > 0); + + inp_KQ_mask_swa = causal + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp_KQ_mask_swa); + + inp_KQ_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask_swa, GGML_TYPE_F16) : inp_KQ_mask_swa; + } +} + +void llama_context::build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + GGML_ASSERT(kv_self.size == n_ctx); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(graph, ggml_cpy(ctx0, k_cur, k_cache_view)); + + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + (kv_head)*ggml_element_size(kv_self.v_l[il])); + + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); +} + +ggml_tensor * llama_context::build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + // TODO: improve + bool is_sliding = false; + + switch (model.arch) { + case LLM_ARCH_COHERE2: + { + const int32_t sliding_window_pattern = 4; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_GEMMA2: + { + const int32_t sliding_window_pattern = 2; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_PHI3: + { + is_sliding = hparams.n_swa > 0; + } break; + default: + { + is_sliding = false; + } + }; + + const auto & kq_mask = is_sliding ? inp_KQ_mask_swa_cnv : inp_KQ_mask_cnv; + + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + //cb(k, "k", il); + + struct ggml_tensor * cur; + + if (cparams.flash_attn) { + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + // split cached v into n_head heads (not transposed) + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + 0); + //cb(v, "v", il); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); + + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx0, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx0, kq); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); + + GGML_ASSERT(kv_self.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + //cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + //cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + //cb(cur, "kqv_merged_cont", il); + + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + ggml_build_forward_expand(graph, cur); + + if (wo) { + cur = build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +ggml_tensor * llama_context::build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) { + const auto & hparams = model.hparams; + + return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); +} + +ggml_tensor * llama_context::get_rope_factors(int il) { + const auto & hparams = model.hparams; + + // choose long/short freq factors based on the context size + const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; +} + +void llama_context::build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & n_ctx = cparams.n_ctx; + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & hparams = model.hparams; + + const auto & n_rot = hparams.n_rot; + const auto & n_layer = hparams.n_layer; + const auto & rope_type = hparams.rope_type; + + const auto & n_embd_head_k = 
hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + GGML_ASSERT(kv_self.size == n_ctx); + + inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + //cb(inp_K_shift, "K_shift", -1); + ggml_set_input(inp_K_shift); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = get_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0); + + struct ggml_tensor * tmp; + if (ggml_is_quantized(k->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); + //cb(tmp, "K_f32", il); + + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + tmp = ggml_rope_ext_inplace(ctx0, tmp, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + //cb(tmp, "K_shifted_f32", il); + + tmp = ggml_cpy(ctx0, tmp, k); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, k, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + //cb(tmp, "K_shifted", il); + + ggml_build_forward_expand(graph, tmp); + } +} + +void llama_context::build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & hparams = model.hparams; + + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = kv_self.cell_max(); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = model.max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (model.max_nodes() - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we 
moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? + bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + 
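                    // descriptive note (editorial, not part of the upstream patch): the source view above selects the
                    // nm contiguous K rows starting at cell i for this layer; the destination view below is shaped the
                    // same but starts at cell id, so a single ggml_cpy per tensor moves the whole run of cells.
                    // The V views are built differently because, without flash attention, the V cache is stored
                    // transposed: each row holds one embedding dimension across all kv_self.size cells, so the
                    // copy has to stride over rows of length kv_self.size instead of contiguous rows.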
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); +#endif +} + +ggml_tensor * llama_context::build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp_s_copy, "inp_s_copy", -1); + ggml_set_input(inp_s_copy); + return inp_s_copy; +} + +ggml_tensor * llama_context::build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp_s_mask, "inp_s_mask", -1); + ggml_set_input(inp_s_mask); + return inp_s_mask; +} + +ggml_tensor * llama_context::build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); + + // copy states + // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv + // this shrinks the tensors's ne[1] to n_kv + states = ggml_get_rows(ctx0, states, state_copy); + + // clear states of sequences which are starting at the beginning of this batch + // FIXME: zero-out NANs? 
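    // descriptive note (editorial, not part of the upstream patch): state_mask has shape [1, n_kv], so the
    // multiply below broadcasts it over n_state; a whole column of `states` is zeroed when its cell has no
    // valid source (mask 0.0f) and passes through unchanged otherwise.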
+ states = ggml_mul(ctx0, states, state_mask); + + // copy states which won't be changed further (between n_seqs and n_kv) + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), + ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); + + // the part of the states that will be used and modified + return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0); +} + +// TODO: split +ggml_tensor * llama_context::build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_tokens = ubatch.n_tokens; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + const int64_t n_seqs = ubatch.n_seqs; + // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) + const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; + // Use the same RMS norm as the final layer norm + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + struct ggml_tensor * conv_states_all = kv_self.k_l[il]; + struct ggml_tensor * ssm_states_all = kv_self.v_l[il]; + + // (ab)using the KV cache to store the states + struct ggml_tensor * conv = build_copy_mask_state( + ctx0, graph, conv_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); + struct ggml_tensor * ssm = build_copy_mask_state( + ctx0, graph, ssm_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); + + // copy last (d_conv - 1) columns back into the state cache + struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in 
the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. + x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + + x = ggml_silu(ctx0, x); + } + + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); + + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); + } + + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. + // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); + + // store last states + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); + + struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); + + // TODO: skip computing output earlier for unused tokens + + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + //cb(cur, "mamba_out", il); + + return cur; +} + + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 73baa711f..a2f41b5c8 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -14,6 +14,8 @@ #include #include +using llama_loras = std::unordered_map; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -22,12 +24,10 @@ struct llama_context { const struct llama_model & model; - struct llama_cparams cparams; - struct llama_sbatch sbatch; // TODO: revisit if needed - struct llama_kv_cache kv_self; - struct llama_adapter_cvec cvec; - - std::unordered_map lora; + 
llama_cparams cparams; + llama_sbatch sbatch; // TODO: revisit if needed + llama_adapter_cvec cvec; + llama_loras loras; std::vector backends; std::vector> set_n_threads_fns; @@ -72,18 +72,6 @@ struct llama_context { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // TODO: find a better way to accommodate mutli-dimension position encoding methods - // number of position id each token get, 1 for each token in most cases. - // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. - int n_pos_per_token = 1; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_ptr sched; @@ -91,29 +79,145 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + void reset(); + + void prepare_k_shift(); + void prepare_defrag(); + void prepare_decode(const llama_ubatch & ubatch); + + void set_inputs(const llama_ubatch & ubatch); + + ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] + + // === encoder-decoder === + + // whether we are computing encoder output or decoder output + bool is_encoding = false; + + // output of the encoder part of the encoder-decoder models + std::vector embd_enc; + std::vector> seq_ids_enc; + + struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] + struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] + + // === unified KV cache === + + llama_kv_cache kv_self; + + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] + + void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case); + + void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case); + + ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case); + + ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale); + + ggml_tensor * get_rope_factors(int il); + + void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph); + + // find holes from the 
beginning of the KV cache and fill them by moving data from the end of the cache + void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph); + + // === recurrent === + + // TODO: add recurrent cache + // TODO: add mamba-specific llama_context + + // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl + ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case); + + ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] - void set_k_shift(llama_kv_cache & kv); + // === vision === + + // TODO: find a better way to accommodate mutli-dimension position encoding methods + // number of position id each token get, 1 for each token in most cases. + // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. + int n_pos_per_token = 1; }; -// TODO: make these methods of llama_context -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); - // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. 
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs); diff --git a/src/llama.cpp b/src/llama.cpp index 6b8212f41..3abc9a0b2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4,8 +4,6 @@ #include "llama-mmap.h" #include "llama-context.h" #include "llama-vocab.h" -#include "llama-sampling.h" -#include "llama-kv-cache.h" #include "llama-model-loader.h" #include "llama-model.h" @@ -106,946 +104,15 @@ enum llm_norm_type { LLM_NORM_GROUP, }; -static struct ggml_tensor * llm_build_inp_embd( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_hparams & hparams, - const llama_ubatch & ubatch, - struct ggml_tensor * tok_embd, - const llm_build_cb & cb) { - const int64_t n_embd = hparams.n_embd; - - struct ggml_tensor * inpL; - - if (ubatch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens); - cb(lctx.inp_tokens, "inp_tokens", -1); - ggml_set_input(lctx.inp_tokens); - - inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); - - // apply lora for embedding tokens if needed - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( - ctx, lw->b, // non-transposed lora_b - ggml_get_rows(ctx, lw->a, lctx.inp_tokens) - ), scale); - inpL = ggml_add(ctx, inpL, inpL_delta); - } - } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = lctx.inp_embd; - ggml_set_input(lctx.inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); - } - - cb(inpL, "inp_embd", -1); - - return inpL; -} - -static void llm_build_kv_store( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { - const int64_t n_ctx = cparams.n_ctx; - - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - GGML_ASSERT(kv.size == n_ctx); - - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); - cb(k_cache_view, "k_cache_view", il); - - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); - - struct ggml_tensor * v_cache_view = nullptr; - - if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv.v_l[il]), - (kv_head)*ggml_element_size(kv.v_l[il])); - - v_cur = ggml_transpose(ctx, v_cur); - } - cb(v_cache_view, "v_cache_view", il); - - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); -} - -// do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, 
- struct ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -// do mat_mul_id, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm_id( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, // struct ggml_tensor * as - struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { - struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float alpha = it.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? it.second * alpha / rank : it.second; - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -static struct ggml_tensor * llm_build_norm( - struct ggml_context * ctx, - struct ggml_tensor * cur, - const llama_hparams & hparams, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - const llm_build_cb & cb, - int il) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; - case LLM_NORM_GROUP: - { - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); - } break; - } - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx, cur, mb); - } - - return cur; -} - -static struct ggml_tensor * llm_build_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * up_s, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * gate_s, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - struct ggml_tensor * down_s, - struct ggml_tensor * act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb & cb, - int il) { - struct ggml_tensor * tmp = up ? 
llm_build_lora_mm(lctx, ctx, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (up_s) { - tmp = ggml_mul(ctx, tmp, up_s); - cb(tmp, "ffn_up_s", il); - } - - if (gate) { - switch (type_gate) { - case LLM_FFN_SEQ: - { - cur = llm_build_lora_mm(lctx, ctx, gate, tmp); - cb(cur, "ffn_gate", il); - } break; - case LLM_FFN_PAR: - { - cur = llm_build_lora_mm(lctx, ctx, gate, cur); - cb(cur, "ffn_gate", il); - } break; - } - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - - if (gate_s) { - cur = ggml_mul(ctx, cur, gate_s); - cb(cur, "ffn_gate_s", il); - } - - } else { - cur = tmp; - } - - switch (type_op) { - case LLM_FFN_SILU: - { - cur = ggml_silu(ctx, cur); - cb(cur, "ffn_silu", il); - } break; - case LLM_FFN_GELU: - { - cur = ggml_gelu(ctx, cur); - cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { - cur = ggml_div(ctx, cur, act_scales); - cb(cur, "ffn_act", il); - } - } break; - case LLM_FFN_RELU: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - } break; - case LLM_FFN_RELU_SQR: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - - cur = ggml_sqr(ctx, cur); - cb(cur, "ffn_sqr(relu)", il); - } break; - case LLM_FFN_SWIGLU: - { - // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0)); - struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx, x0, x1); - cb(cur, "ffn_mul", il); - } break; - } - - if (type_gate == LLM_FFN_PAR) { - cur = ggml_mul(ctx, cur, tmp); - cb(cur, "ffn_gate_par", il); - } - - if (down) { - cur = llm_build_lora_mm(lctx, ctx, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx, cur, down_b); - } - - if (down_s) { - cur = ggml_mul(ctx, cur, down_s); - cb(cur, "ffn_down_s", il); - } - - return cur; -} - -static struct ggml_tensor * llm_build_moe_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * gate_inp, - struct ggml_tensor * up_exps, - struct ggml_tensor * gate_exps, - struct ggml_tensor * down_exps, - struct ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llm_ffn_op_type type_op, - bool norm_w, - bool scale_w, - float w_scale, -llama_expert_gating_func_type gating_op, - const llm_build_cb & cb, - int il) { - int64_t n_embd = cur->ne[0]; - int64_t n_tokens = cur->ne[1]; - - ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = nullptr; - switch (gating_op) { - case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: - { - probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] - } break; - case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: - { - probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens] - } break; - default: - GGML_ABORT("fatal error"); - } - cb(probs, "ffn_moe_probs", il); - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx, probs, exp_probs_b); - 
cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx, - ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - if (norm_w) { - weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - - weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); - } - if (scale_w) { - weights = ggml_scale(ctx, weights, w_scale); - cb(weights, "ffn_moe_weights_scaled", il); - } - - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); - - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } - - ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); - - ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx, experts, weights); - - // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx, moe_out, cur_expert); - } - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx, moe_out); - } - - return moe_out; -} - -static struct ggml_tensor * llm_build_kqv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); - cb(q, "q", il); - - struct ggml_tensor * k = - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - 
ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * cur; - - if (cparams.flash_attn) { - GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - - // split cached v into n_head heads (not transposed) - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv.v_l[il]->type, n_embd_head_v), - 0); - cb(v, "v", il); - - cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, - hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); - - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); - } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below - - kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx, kq, 30); - } - - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx, kq); - kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); - } - - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - GGML_ASSERT(kv.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - } - - ggml_build_forward_expand(graph, cur); - - if (wo) { - cur = llm_build_lora_mm(lctx, ctx, wo, cur); - } - - if (wo_b) { - cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx, cur, wo_b); - } - - return cur; -} - -static struct ggml_tensor * llm_build_kv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t kv_head, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); - - llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); - - struct ggml_tensor * cur; - - cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); - cb(cur, "kqv_out", il); - - return 
cur; -} - -static struct ggml_tensor * llm_build_copy_mask_state( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct ggml_tensor * s, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t n_state, - int32_t kv_size, - int32_t kv_head, - int32_t n_kv, - int32_t n_seqs) { - struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size); - - // copy states - // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv - // this shrinks the tensors's ne[1] to n_kv - states = ggml_get_rows(ctx, states, state_copy); - - // clear states of sequences which are starting at the beginning of this batch - // FIXME: zero-out NANs? - states = ggml_mul(ctx, states, state_mask); - - // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), - ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); - - // the part of the states that will be used and modified - return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0); -} - -// TODO: split -static struct ggml_tensor * llm_build_mamba( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_ubatch & ubatch, - struct ggml_cgraph * graph, - struct ggml_tensor * cur, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t kv_head, - int32_t n_kv, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = model.hparams; - const llama_kv_cache & kv = lctx.kv_self; - const int64_t d_conv = hparams.ssm_d_conv; - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t d_state = hparams.ssm_d_state; - const int64_t dt_rank = hparams.ssm_dt_rank; - const int64_t n_seqs = ubatch.n_seqs; - // Some variants of Mamba arch (e.g. 
FalconMamba do apply layer norm on B and Dt layers) - const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; - // Use the same RMS norm as the final layer norm - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * conv_states_all = kv.k_l[il]; - struct ggml_tensor * ssm_states_all = kv.v_l[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * conv = llm_build_copy_mask_state(ctx, - graph, conv_states_all, state_copy, state_mask, - hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs); - conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs); - struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx, - graph, ssm_states_all, state_copy, state_mask, - hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs); - ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs); - - // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs); - - // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); - struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - - // conv - { - // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0); - - // copy last (d_conv - 1) columns back into the state cache - struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - - ggml_build_forward_expand(graph, - ggml_cpy(ctx, last_conv, - ggml_view_1d(ctx, conv_states_all, - (d_conv - 1)*(d_inner)*(n_seqs), - kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - - // 1D convolution - // The equivalent is to make a self-overlapping view of conv_x - // over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weight, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're left with the resulting x tensor. - // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); - - // bias - x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b); - - x = ggml_silu(ctx, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x); - // split - struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. 
FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx, dt, norm_rms_eps); - B = ggml_rms_norm(ctx, B, norm_rms_eps); - C = ggml_rms_norm(ctx, C, norm_rms_eps); - } - - // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. - // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); - - // store last states - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), - ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - - struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - - // TODO: skip computing output earlier for unused tokens - - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z))); - - // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y); - } - - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs); - cb(cur, "mamba_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_rwkv6_time_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state, - size_t wkv_head_size, - size_t head_count_kv) { - size_t n_embd = cur->ne[0]; - size_t n_seq_tokens = cur->ne[1]; - size_t n_seqs = cur->ne[2]; - - size_t head_size = wkv_head_size; - size_t head_count = n_embd / head_size; - - size_t n_tokens = n_seqs * n_seq_tokens; - - bool is_qrwkv = layer->time_mix_first == nullptr; - - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - - sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - - struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); - - xxx = ggml_reshape_4d( - ctx, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_w1, xxx) - ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - ); - - xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2)); - - xxx = ggml_mul_mat( - ctx, - ggml_reshape_4d( - ctx, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - ), - xxx - ); - - struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { - // fusing these weights makes some performance improvement - sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens); - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur); - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, 
xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - } else { - // for backward compatibility - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur); - } - - struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr); - struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk); - struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx, r, layer->time_mix_receptance_b); - } - if (layer->time_mix_key_b) { - k = ggml_add(ctx, k, layer->time_mix_key_b); - } - if (layer->time_mix_value_b) { - v = ggml_add(ctx, v, layer->time_mix_value_b); - } - - struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg); - if (is_qrwkv) { - g = ggml_sigmoid(ctx, g); - } else { - g = ggml_silu(ctx, g); - } - - if (head_count_kv != head_count) { - GGML_ASSERT(head_count % head_count_kv == 0); - k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens); - v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens); - struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - k = ggml_repeat(ctx, k, tmp); - v = ggml_repeat(ctx, v, tmp); - } - - k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens); - v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens); - r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens); - - struct ggml_tensor * w = ggml_mul_mat( - ctx, - layer->time_mix_decay_w2, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) - ) - ); - - w = ggml_add(ctx, w, layer->time_mix_decay); - w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); - w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens); - - if (is_qrwkv) { - // k = k * (1 - w) - k = ggml_sub(ctx, k, ggml_mul(ctx, k, w)); - } - - struct ggml_tensor * wkv_output; - if (!layer->time_mix_first) { - wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - } else { - wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - } - cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - if (!is_qrwkv) { - // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); - cur = ggml_norm(ctx, cur, 64e-5f); - - // Convert back to regular vectors. 
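// Editor's note: not part of this patch. In the RWKV6 time-mix above, the decay transform
// w = exp(-exp(w)) maps an unconstrained decay logit into (0, 1) so it can act as a per-channel
// forgetting factor inside the linear-attention recurrence. A much-simplified single-head,
// single-step view of the state update behind ggml_rwkv_wkv6 / ggml_gated_linear_attn (the
// time_mix_first "bonus" term is ignored; wkv_step_simplified is a hypothetical name):
#include <cmath>
#include <vector>

// state is a head_size x head_size matrix stored row-major:
// state[i*n + j] accumulates k[i]*v[j] with per-channel decay on the key channel i.
static void wkv_step_simplified(
        std::vector<float> & state,
        std::vector<float> & out,           // [head_size]
        const std::vector<float> & r,       // [head_size] receptance
        const std::vector<float> & k,       // [head_size]
        const std::vector<float> & v,       // [head_size]
        const std::vector<float> & w_logit  // [head_size] raw decay
        ) {
    const size_t n = r.size();
    out.assign(n, 0.0f);
    for (size_t i = 0; i < n; ++i) {
        const float w = std::exp(-std::exp(w_logit[i])); // decay factor in (0, 1)
        for (size_t j = 0; j < n; ++j) {
            float & s = state[i*n + j];
            s = s*w + k[i]*v[j];   // decay old state, add the new outer product
            out[j] += r[i]*s;      // receptance-gated readout
        }
    }
}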
- cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - } - - cur = ggml_mul(ctx, cur, g); - cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - - return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); -} - -static struct ggml_tensor * llm_build_rwkv6_channel_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev) { - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur); - struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur); - - struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr( - ctx, - ggml_relu( - ctx, - llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk) - ) - ); - - return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k)); -} - struct llm_build_context { - const llama_model & model; - llama_context & lctx; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - const llama_kv_cache & kv_self; + llama_context & lctx; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; + //const llama_kv_cache & kv_self; + const llama_adapter_cvec & cvec; + const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -1070,12 +137,13 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; - const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_ctx_orig; + const bool worst_case; const bool flash_attn; const enum llama_pooling_type pooling_type; @@ -1089,16 +157,18 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, - bool worst_case) : - model (lctx.model), + llama_context & lctx, + const llama_ubatch & ubatch, + const llm_build_cb & cb, + bool worst_case) : lctx (lctx), + model (lctx.model), hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - kv_self (lctx.kv_self), + //kv_self (lctx.kv_self), + cvec (lctx.cvec), + loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -1120,11 +190,12 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - n_kv (worst_case ? kv_self.size : kv_self.n), + //n_kv (worst_case ? kv_self.size : kv_self.n), + //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), - kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_ctx_orig (cparams.n_ctx_orig_yarn), + worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -1142,21 +213,7 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; - lctx.inp_pos_bucket = nullptr; - lctx.inp_embd_enc = nullptr; - lctx.inp_KQ_mask_cross = nullptr; + lctx.reset(); } void free() { @@ -1164,125 +221,597 @@ struct llm_build_context { ctx0 = nullptr; } + struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { + struct ggml_tensor * inpL; + + if (ubatch.token) { + lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(lctx.inp_tokens, "inp_tokens", -1); + ggml_set_input(lctx.inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, lctx.inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, lctx.inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + lctx.inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = lctx.inp_embd; + ggml_set_input(lctx.inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + cb(inpL, "inp_embd", -1); + + return inpL; + } + + // do mat_mul, while optionally apply lora + struct ggml_tensor * build_lora_mm( + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + // do mat_mul_id, while optionally apply lora + struct ggml_tensor * build_lora_mm_id( + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? 
lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + struct ggml_tensor * build_norm( + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); + } break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + struct ggml_tensor * build_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * up_s, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_s, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (up_s) { + tmp = ggml_mul(ctx0, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = build_lora_mm(gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = build_lora_mm(gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + + if (gate_s) { + cur = ggml_mul(ctx0, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx0, cur, act_scales); + cb(cur, "ffn_act", il); + } + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + case LLM_FFN_SWIGLU: + { + // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + int64_t split_point = cur->ne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx0, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx0, x0, x1); + cb(cur, "ffn_mul", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = build_lora_mm(down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + if (down_s) { + cur = ggml_mul(ctx0, cur, down_s); + cb(cur, "ffn_down_s", il); + } + + return cur; + } + + struct ggml_tensor * build_moe_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, + llama_expert_gating_func_type gating_op, + const llm_build_cb & cb, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; + + ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } + cb(probs, "ffn_moe_probs", il); + + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx0, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); + + if (norm_w) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); + + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); + } + if (scale_w) { + weights = ggml_scale(ctx0, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); + } + + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = 
ggml_silu(ctx0, gate); + cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ABORT("fatal error"); + } + + ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); + + ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); + + experts = ggml_mul(ctx0, experts, weights); + + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + } + } + + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx0, moe_out); + } + + return moe_out; + } + + struct ggml_tensor * build_attn( + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + const llm_build_cb & cb, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); + + //build_kv_store(graph, k_cur, v_cur, il); + lctx.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + + struct ggml_tensor * cur; + + //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lctx.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cb(cur, "kqv_out", il); + + return cur; + } + + //struct ggml_tensor * build_rwkv6_time_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev, + // struct ggml_tensor ** wkv_state, + // size_t wkv_head_size, + // size_t head_count_kv) { + // size_t n_embd = cur->ne[0]; + // size_t n_seq_tokens = cur->ne[1]; + // size_t n_seqs = cur->ne[2]; + + // size_t head_size = wkv_head_size; + // size_t head_count = n_embd / head_size; + + // size_t n_tokens = n_seqs * n_seq_tokens; + + // bool is_qrwkv = layer->time_mix_first == nullptr; + + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + + // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + + // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + // xxx = ggml_reshape_4d( + // ctx0, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + // ), + // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + // ); + + // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + // xxx = ggml_mul_mat( + // ctx0, + // ggml_reshape_4d( + // ctx0, + // layer->time_mix_w2, + // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + // ), + // xxx + // ); + + // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + // if (layer->time_mix_lerp_fused) { + // // fusing these weights makes some performance improvement + // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); 
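// Editor's note: not part of this patch. The fused lerp in the (currently commented-out)
// rwkv6 time-mix above is RWKV6's data-dependent token shift: with sx = x_prev - x, the mixed
// input is x + sx * (mu_static + mu_dynamic), i.e. a per-channel interpolation between the
// current and previous token. A scalar sketch, where mu_dynamic corresponds to the `xxx`
// projection and mu_static to `time_mix_lerp_fused` (ddlerp is a hypothetical name):
static float ddlerp(float x, float x_prev, float mu_static, float mu_dynamic) {
    const float sx = x_prev - x;
    return x + sx * (mu_static + mu_dynamic);
}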
+ // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + // } else { + // // for backward compatibility + // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + // } + + // struct ggml_tensor * r = build_lora_mm(layer->time_mix_receptance, xr); + // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); + // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); + // if (layer->time_mix_receptance_b) { + // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + // } + // if (layer->time_mix_key_b) { + // k = ggml_add(ctx0, k, layer->time_mix_key_b); + // } + // if (layer->time_mix_value_b) { + // v = ggml_add(ctx0, v, layer->time_mix_value_b); + // } + + // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); + // if (is_qrwkv) { + // g = ggml_sigmoid(ctx0, g); + // } else { + // g = ggml_silu(ctx0, g); + // } + + // if (head_count_kv != head_count) { + // GGML_ASSERT(head_count % head_count_kv == 0); + // k = ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); + // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); + // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); + // k = ggml_repeat(ctx0, k, tmp); + // v = ggml_repeat(ctx0, v, tmp); + // } + + // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); + // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); + // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); + + // struct ggml_tensor * w = ggml_mul_mat( + // ctx0, + // layer->time_mix_decay_w2, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + // ) + // ); + + // w = ggml_add(ctx0, w, layer->time_mix_decay); + // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); + + // if (is_qrwkv) { + // // k = k * (1 - w) + // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + // } + + // struct ggml_tensor * wkv_output; + // if (!layer->time_mix_first) { + // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); + // } else { + // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, 
layer->time_mix_first, w, *wkv_state); + // } + // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + // if (!is_qrwkv) { + // // group norm with head_count groups + // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); + // cur = ggml_norm(ctx0, cur, 64e-5f); + + // // Convert back to regular vectors. + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + // } else { + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // } + + // cur = ggml_mul(ctx0, cur, g); + // cur = build_lora_mm(layer->time_mix_output, cur); + + // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); + //} + + //struct ggml_tensor * build_rwkv6_channel_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev) { + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + // struct ggml_tensor * k = ggml_sqr( + // ctx0, + // ggml_relu( + // ctx0, + // build_lora_mm(layer->channel_mix_key, xk) + // ) + // ); + + // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + //} + struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - GGML_ASSERT(kv_self.size == n_ctx); - - lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(lctx.inp_K_shift, "K_shift", -1); - ggml_set_input(lctx.inp_K_shift); - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); - - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - cb(tmp, "K_f32", il); - - for (auto & backend : lctx.backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted_f32", il); - - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - } - cb(tmp, "K_shifted", il); - - ggml_build_forward_expand(gf, tmp); - } + lctx.build_k_shift(ctx0, gf); return gf; } - struct ggml_cgraph * build_defrag(const std::vector & ids) { + struct ggml_cgraph * build_defrag() { struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, model.max_nodes(), false); - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); + lctx.build_defrag(ctx0, gf); return gf; } @@ -1294,21 +823,6 @@ struct llm_build_context { return lctx.inp_pos; } - struct ggml_tensor * build_rope_factors(int il) { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - - if (model.layers[il].rope_freqs != nullptr) { - return model.layers[il].rope_freqs; - } - - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } - - return model.layers[il].rope_short; - } - struct ggml_tensor * build_inp_out_ids() { lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); cb(lctx.inp_out_ids, "inp_out_ids", -1); @@ -1316,28 +830,6 @@ struct llm_build_context { return lctx.inp_out_ids; } - struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { - lctx.inp_KQ_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(lctx.inp_KQ_mask); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; - } - - struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) { - GGML_ASSERT(hparams.n_swa > 0); - - lctx.inp_KQ_mask_swa = causal - ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(lctx.inp_KQ_mask_swa); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa; - } - struct ggml_tensor * build_inp_mean() { lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); cb(lctx.inp_mean, "inp_mean", -1); @@ -1352,20 +844,6 @@ struct llm_build_context { return lctx.inp_cls; } - struct ggml_tensor * build_inp_s_copy() { - lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - cb(lctx.inp_s_copy, "inp_s_copy", -1); - ggml_set_input(lctx.inp_s_copy); - return lctx.inp_s_copy; - } - - struct ggml_tensor * build_inp_s_mask() { - lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - cb(lctx.inp_s_mask, "inp_s_mask", -1); - ggml_set_input(lctx.inp_s_mask); - return lctx.inp_s_mask; - } - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; @@ -1431,37 +909,37 @@ struct llm_build_context { return gf; } - struct ggml_tensor * build_pos_bucket(bool causal) { - if (causal) { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - } else { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - } + //struct ggml_tensor * build_pos_bucket(bool causal) { + // if (causal) { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + // } else { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + // } - ggml_set_input(lctx.inp_pos_bucket); - cb(lctx.inp_pos_bucket, "pos_bucket", -1); + // ggml_set_input(lctx.inp_pos_bucket); + // cb(lctx.inp_pos_bucket, "pos_bucket", -1); - return lctx.inp_pos_bucket; - } + // return lctx.inp_pos_bucket; + //} - struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - cb(pos_bucket_1d, "pos_bucket_1d", -1); + //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + // cb(pos_bucket_1d, "pos_bucket_1d", -1); - struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - cb(pos_bias, "pos_bias", -1); + // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_cont(ctx0, pos_bias); - 
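// Editor's note: not part of this patch. The build_pos_bias helper being commented out above
// gathers a learned bias per (key position, query position) pair from attn_rel_b using
// precomputed bucket indices, producing the T5-style additive term for the attention scores.
// A scalar sketch of that gather with plain nested vectors; the bucket indices are assumed to
// be computed elsewhere, and gather_pos_bias is a hypothetical name:
#include <vector>

// attn_rel_b: [n_buckets][n_head], pos_bucket: [n_kv][n_tokens]
// result: bias laid out as [n_head][n_tokens][n_kv], added to KQ before the soft-max
static std::vector<float> gather_pos_bias(
        const std::vector<std::vector<float>> & attn_rel_b,
        const std::vector<std::vector<int>>   & pos_bucket,
        int n_head) {
    const int n_kv     = (int) pos_bucket.size();
    const int n_tokens = n_kv > 0 ? (int) pos_bucket[0].size() : 0;

    std::vector<float> bias((size_t) n_head*n_tokens*n_kv);
    for (int h = 0; h < n_head; ++h) {
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                bias[((size_t) h*n_tokens + j)*n_kv + i] = attn_rel_b[pos_bucket[i][j]][h];
            }
        }
    }
    return bias;
}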
cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_cont(ctx0, pos_bias); + // cb(pos_bias, "pos_bias", -1); - return pos_bias; - } + // return pos_bias; + //} struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; @@ -1491,45 +969,44 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1550,9 +1027,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1574,12 +1051,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1588,12 +1065,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -1615,7 +1092,7 @@ struct 
llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1624,13 +1101,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1657,13 +1134,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -1676,37 +1152,37 @@ struct llm_build_context { cur = inpL; } else { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } if (n_head > 0 && n_head_kv == 0) { // "linear attention" of Llama-3_1-Nemotron-51B - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); cb(cur, "wo", il); } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1727,9 +1203,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1754,12 +1230,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, 
model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1776,7 +1252,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1785,13 +1261,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1815,31 +1291,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -1865,9 +1340,9 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1882,12 +1357,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1897,7 +1372,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1906,13 +1381,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
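// Editor's note: not part of this patch. After this refactor, every projection in the graph
// builders above (wq/wk/wv, ffn weights, lm_head) goes through build_lora_mm, which computes
// W*x plus a scaled low-rank correction B*(A*x) per active adapter, with the scale typically
// adapter_scale * alpha / rank. A minimal dense sketch for a single adapter, using row-major
// std::vector matrices instead of ggml tensors (lora_mm is a hypothetical name):
#include <vector>

// y = W*x + scale*B*(A*x); W: [n_out x n_in], A: [r x n_in], B: [n_out x r]
static std::vector<float> lora_mm(
        const std::vector<float> & W, const std::vector<float> & A,
        const std::vector<float> & B, const std::vector<float> & x,
        int n_out, int n_in, int r, float scale) {
    std::vector<float> ax(r, 0.0f);
    for (int i = 0; i < r; ++i) {
        for (int j = 0; j < n_in; ++j) {
            ax[i] += A[i*n_in + j]*x[j];
        }
    }
    std::vector<float> y(n_out, 0.0f);
    for (int i = 0; i < n_out; ++i) {
        for (int j = 0; j < n_in; ++j) {
            y[i] += W[i*n_in + j]*x[j];          // base projection
        }
        for (int k = 0; k < r; ++k) {
            y[i] += scale*B[i*r + k]*ax[k];      // low-rank adapter delta
        }
    }
    return y;
}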
ggml_build_forward_expand(gf, cur); @@ -1930,31 +1405,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -1970,9 +1444,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1987,12 +1461,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2002,7 +1476,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2011,11 +1485,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2034,37 +1508,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - 
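Every layer output now calls cvec.apply_to(ctx0, cur, il) directly instead of going through lctx.cvec. The helper itself is not part of this hunk; as a rough sketch, applying a control vector is just an optional per-layer add (tensor_for is a hypothetical lookup that returns nullptr when no vector is loaded for that layer).

struct ggml_tensor * llama_control_vector::apply_to(
        struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
    struct ggml_tensor * layer_dir = tensor_for(il);   // hypothetical lookup
    if (layer_dir != nullptr) {
        // steer the hidden state by adding the preloaded, pre-scaled direction
        cur = ggml_add(ctx, cur, layer_dir);
    }
    return cur;
}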
LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm_2", il); } else { cur = attn_norm; } - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2091,9 +1564,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2108,7 +1581,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result + cur = build_ffn(attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2119,7 +1592,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2129,13 +1602,13 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2156,7 +1629,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // multiply by embedding_multiplier_scale of 78.38367176906169 inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); @@ -2164,37 +1637,36 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, 
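build_lora_mm(w, cur) replaces llm_build_lora_mm(lctx, ctx0, w, cur) throughout. Conceptually it is a matmul against the base weight plus one scaled low-rank update per loaded LoRA adapter; the sketch below only illustrates that idea, and the container and accessor names (loras, get_weight, a, b) are assumptions rather than the code touched here.

struct ggml_tensor * llm_build_context::build_lora_mm(
        struct ggml_tensor * w, struct ggml_tensor * cur) {
    // base projection
    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);

    for (const auto & it : loras) {        // hypothetical container: adapter -> user scale
        struct llama_lora_weight * lw = it.first->get_weight(w);
        if (lw == nullptr) {
            continue;                      // this adapter does not touch w
        }
        // low-rank update: B * (A * x), scaled by the per-adapter strength
        struct ggml_tensor * ab = ggml_mul_mat(ctx0, lw->b, ggml_mul_mat(ctx0, lw->a, cur));
        res = ggml_add(ctx0, res, ggml_scale(ctx0, ab, it.second));
    }
    return res;
}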
model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -2215,9 +1687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -2231,9 +1703,9 @@ struct llm_build_context { // Grok // if attn_out_norm is present then apply it before adding the input if (model.layers[il].attn_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_out_norm", il); } @@ -2242,12 +1714,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2264,16 +1736,16 @@ struct llm_build_context { // if layer_out_norm is present then apply it before adding the input // Idea: maybe ffn_out_norm is a better name if (model.layers[il].layer_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "layer_out_norm", il); } cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2282,13 +1754,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -2316,21 +1788,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention @@ -2339,7 +1810,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -2367,9 +1838,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = 
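The two Grok magic numbers survive the refactor unchanged and are easy to misread as arbitrary: the embedding multiplier 78.38367176906169 matches sqrt(6144), the square root of Grok-1's embedding width, and the output multiplier 0.5773502691896257 is 1/sqrt(3). A tiny standalone check, assuming only that reading of the constants:

#include <cassert>
#include <cmath>

int main() {
    const double embd_scale  = 78.38367176906169;    // from the Grok builder above
    const double logit_scale = 0.5773502691896257;

    assert(std::fabs(embd_scale  - std::sqrt(6144.0))  < 1e-11);
    assert(std::fabs(logit_scale - 1.0/std::sqrt(3.0)) < 1e-15);
    return 0;
}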
build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2385,12 +1856,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(ffn_inp, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2406,7 +1877,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2415,13 +1886,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM, cb, -1); + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -2440,13 +1911,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -2455,15 +1925,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2479,9 +1949,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2497,13 +1967,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2513,20 +1983,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = 
llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2543,28 +2013,27 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -2573,9 +2042,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2590,12 +2059,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2605,7 +2074,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2614,13 +2083,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2645,7 +2114,7 @@ struct llm_build_context { } // construct input embeddings (token, type, position) - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); @@ -2656,11 +2125,10 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // embed layer norm 
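The BERT builder just below is the first one here that asks for a non-causal mask: the explicit build_inp_KQ_mask(false) becomes build_attn_inp(ctx0, n_tokens, false, false, worst_case), while the decoder-only builders pass true. The flag meanings are not spelled out in this hunk; judging from the call sites, the first selects causal masking and the second a sliding-window variant (see the Phi-3 hunk further down). For intuition only, the difference between the two mask shapes:

#include <cstdint>
#include <limits>
#include <vector>

// illustrative only: a causal mask blocks attention to future tokens (j > i),
// a bidirectional mask (BERT-style encoders) blocks nothing
std::vector<float> make_kq_mask(int64_t n_tokens, bool causal) {
    const float minus_inf = -std::numeric_limits<float>::infinity();
    std::vector<float> mask(n_tokens * n_tokens, 0.0f);
    for (int64_t i = 0; i < n_tokens; ++i) {      // query position
        for (int64_t j = 0; j < n_tokens; ++j) {  // key position
            if (causal && j > i) {
                mask[i*n_tokens + j] = minus_inf;
            }
        }
    }
    return mask;
}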
- inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); + lctx.build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -2672,33 +2140,33 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2730,7 +2198,8 @@ struct llm_build_context { struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lctx.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -2747,7 +2216,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -2768,11 +2237,11 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); // attention layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); if (model.layers[il].attn_norm_2 != nullptr) { cur = ggml_add(ctx0, cur, inpL); // re-add the layer input - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); } struct 
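In the BERT attention above, the commented-out ggml_soft_max_ext call still names the mask and the ALiBi bias explicitly, while the new lctx.build_soft_max_ext(ctx0, kq, scale) takes only the scale. A sketch of what the wrapper presumably does; the member holding the prepared mask is an assumption here, not taken from this diff.

struct ggml_tensor * llama_context::build_soft_max_ext(
        struct ggml_context * ctx0, struct ggml_tensor * kq, float kq_scale) {
    // the context already prepared the KQ mask in build_attn_inp(), and the
    // maximum ALiBi slope comes from the model's hyperparameters
    return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv /* assumed member */,
                             kq_scale, model.hparams.f_max_alibi_bias);
}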
ggml_tensor * ffn_inp = cur; @@ -2780,21 +2249,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2807,7 +2276,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); // output layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); // input for next layer inpL = cur; @@ -2832,27 +2301,26 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - inpL = llm_build_norm(ctx0, inpL, hparams, + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2868,9 +2336,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2886,13 +2354,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2902,20 +2370,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = 
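The build_ffn variants above differ only in the (activation, layout) pair: LLM_FFN_GELU with LLM_FFN_SEQ for BERT, LLM_FFN_GELU with LLM_FFN_PAR for Jina-BERT v2, and a gated pair for the remaining branch. Assuming the usual llama.cpp meaning of those enums, SEQ is a plain two-matrix MLP and PAR is a gated (GLU-style) block; a stripped-down version of the gated SILU case:

// gated FFN, LLM_FFN_PAR layout with SILU activation:
//   y = W_down * ( silu(W_gate * x) .* (W_up * x) )
// LLM_FFN_SEQ simply drops the gate branch: y = W_down * act(W_up * x)
static struct ggml_tensor * ffn_par_silu(
        struct ggml_context * ctx,
        struct ggml_tensor  * x,
        struct ggml_tensor  * W_up,
        struct ggml_tensor  * W_gate,
        struct ggml_tensor  * W_down) {
    struct ggml_tensor * up   = ggml_mul_mat(ctx, W_up,   x);
    struct ggml_tensor * gate = ggml_silu(ctx, ggml_mul_mat(ctx, W_gate, x));
    return ggml_mul_mat(ctx, W_down, ggml_mul(ctx, gate, up));
}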
llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2934,10 +2402,9 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -2952,17 +2419,17 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { cur = attn_norm; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -2985,30 +2452,30 @@ struct llm_build_context { // Q/K Layernorm if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -3025,12 +2492,12 @@ struct llm_build_context { // feed forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3040,7 +2507,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3049,13 +2516,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3072,22 +2539,21 @@ struct llm_build_context { 
struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * inpSA = cur; @@ -3095,21 +2561,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -3122,17 +2588,17 @@ struct llm_build_context { cb(Kcur, "Kcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -3151,9 +2617,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3170,16 +2636,16 @@ struct llm_build_context { // feed-forward network { if (model.layers[il].ffn_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); } else { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3189,7 +2655,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3198,14 +2664,14 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = 
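Virtually every builder repeats the same long positional ggml_rope_ext call. For readability, here is one such call with every argument annotated; the meanings follow ggml's rope API, and the local names are the ones already used throughout llm_build_context.

Qcur = ggml_rope_ext(
        ctx0,
        ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), // heads split first
        inp_pos,          // per-token positions
        rope_factors,     // optional frequency factors (nullptr for most models)
        n_rot,            // number of rotated dimensions
        rope_type,        // GGML_ROPE_TYPE_NORM or _NEOX, from the model hparams
        n_ctx_orig,       // original training context, used by NTK/YaRN scaling
        freq_base,        // RoPE theta
        freq_scale,       // linear position scaling
        ext_factor,       // YaRN extrapolation mix
        attn_factor,      // YaRN attention magnitude correction
        beta_fast,        // YaRN correction ramp, fast boundary
        beta_slow);       // YaRN correction ramp, slow boundary
cb(Qcur, "Qcur", il);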
build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3222,25 +2688,24 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3270,9 +2735,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3287,12 +2752,12 @@ struct llm_build_context { // feed-forward forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3302,7 +2767,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3311,13 +2776,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3335,37 +2800,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = 
llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3384,9 +2848,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3400,12 +2864,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3414,7 +2878,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3423,13 +2887,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3446,7 +2910,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); @@ -3454,8 +2918,8 @@ struct llm_build_context { ggml_set_input(lctx.inp_pos); struct ggml_tensor * inp_pos = lctx.inp_pos; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -3463,25 +2927,25 @@ struct llm_build_context { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * 
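Qwen2-VL is the one builder above whose position input is allocated as n_tokens * 4 and which copies hparams.rope_sections into a local sections[4]: its multimodal (M-)RoPE rotates different dimension groups with different position components (presumably temporal, height and width plus a spare section) instead of a single scalar position per token. The helper below is purely hypothetical; it only makes the assumed buffer layout explicit (four planes of n_tokens positions each).

#include <cstdint>

// hypothetical helper: index into the n_tokens*4 position buffer, assuming it is
// laid out as [ section0: n_tokens | section1: n_tokens | section2: n_tokens | section3: n_tokens ]
static inline int64_t mrope_pos_index(int64_t section, int64_t token, int64_t n_tokens) {
    return section * n_tokens + token;
}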
Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3502,9 +2966,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3518,12 +2982,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3532,7 +2996,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3541,13 +3005,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3568,37 +3032,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3617,9 +3080,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, 
Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3634,13 +3097,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3655,14 +3118,14 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_ffn = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -3680,7 +3143,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3689,13 +3152,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3715,19 +3178,18 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm_output, "attn_norm", il); // self-attention @@ -3737,7 +3199,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3747,9 +3209,9 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, 
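The Qwen2-MoE shared-expert gate above builds a sigmoid without a dedicated ggml op by dividing ggml_silu by its input: since silu(x) = x * sigmoid(x), silu(x)/x equals sigmoid(x) for any non-zero x. A standalone sanity check of the identity:

#include <cassert>
#include <cmath>
#include <initializer_list>

// silu(x) = x * sigmoid(x)  =>  silu(x) / x = sigmoid(x)   (x != 0)
int main() {
    for (double x : {-4.0, -0.5, 0.25, 3.0}) {
        const double sigmoid = 1.0 / (1.0 + std::exp(-x));
        const double silu    = x * sigmoid;
        assert(std::fabs(silu / x - sigmoid) < 1e-12);
    }
    return 0;
}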
attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3776,9 +3238,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3791,7 +3253,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, + ffn_output = build_ffn(attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3802,20 +3264,20 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -3834,19 +3296,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = nullptr; - if (hparams.n_swa == 0) { - // Phi-4 doesn't use sliding window attention - KQ_mask = build_inp_KQ_mask(); - } else { - KQ_mask = build_inp_KQ_mask_swa(); - } + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -3854,12 +3310,12 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); - struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(attn_norm_output, "attn_norm", il); struct ggml_tensor * Qcur = nullptr; @@ -3867,16 +3323,16 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, 
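Phi-3 above is where the second boolean of build_attn_inp earns its keep: the removed code picked between build_inp_KQ_mask() and build_inp_KQ_mask_swa() based on hparams.n_swa (zero meaning no sliding window, as the Phi-4 comment notes), and that choice now happens behind lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case). The body below is a reconstruction under that assumption, not the actual implementation.

void llama_context::build_attn_inp(
        struct ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa, bool worst_case) {
    const int64_t n_kv = worst_case ? kv_self.size : kv_self.n;   // assumed sizing rule

    // (a non-causal encoder would size this n_tokens x n_tokens; omitted here)
    inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
    ggml_set_input(inp_KQ_mask);

    if (swa && model.hparams.n_swa > 0) {
        // sliding-window models additionally get a windowed mask
        inp_KQ_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
        ggml_set_input(inp_KQ_mask_swa);
    }

    // `causal` only changes how set_inputs() later fills these masks
    GGML_UNUSED(causal);
}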
model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3901,9 +3357,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3916,14 +3372,14 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, residual); residual = cur; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3932,7 +3388,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3947,20 +3403,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); @@ -3984,20 +3440,19 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); struct ggml_tensor * attention_norm = cur; @@ -4005,13 +3460,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct 
ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4026,9 +3481,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; @@ -4044,7 +3499,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4055,7 +3510,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4064,13 +3519,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4089,13 +3544,12 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -4104,15 +3558,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4128,9 +3582,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4146,13 +3600,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4162,20 +3616,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, 
ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4194,24 +3648,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4239,9 +3692,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4257,13 +3710,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4273,20 +3726,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4304,41 +3757,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + 
LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4359,9 +3811,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4375,12 +3827,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4389,7 +3841,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4398,13 +3850,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4422,41 +3874,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = 
llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4477,9 +3928,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4493,12 +3944,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4507,7 +3958,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4516,13 +3967,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4546,7 +3997,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); @@ -4555,17 +4006,16 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -4575,9 +4025,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -4616,9 +4066,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not 
support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -4670,9 +4120,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -4692,12 +4142,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4711,7 +4161,7 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4720,9 +4170,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head scaling @@ -4731,7 +4181,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4747,7 +4197,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4755,26 +4205,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4792,9 +4241,9 @@ struct llm_build_context { ext_factor, 
attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -4807,14 +4256,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4824,7 +4273,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4833,13 +4282,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4855,7 +4304,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4863,31 +4312,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // gemma 2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { - // (il % 2) layers use SWA - struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? 
KQ_mask_swa : KQ_mask; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4911,14 +4354,14 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -4931,14 +4374,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4947,13 +4390,13 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4962,13 +4405,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -4993,41 +4436,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE 
them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5048,9 +4490,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5065,12 +4507,12 @@ struct llm_build_context { // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5079,7 +4521,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5088,13 +4530,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5109,21 +4551,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur, - state_copy, state_mask, - kv_head, n_kv, cb, il); + //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); + cur = lctx.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -5142,13 +4583,13 @@ struct llm_build_context { } // final rmsnorm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + 
LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5167,41 +4608,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5220,16 +4660,16 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -5247,9 +4687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5264,7 +4704,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5276,7 +4716,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5285,13 +4725,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = 
ggml_scale(ctx0, cur, f_logit_scale); @@ -5315,15 +4755,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // cohere2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -5331,35 +4768,34 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // three layers sliding window attention (window size 4096) and ROPE // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il); + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5385,8 +4821,8 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5401,7 +4837,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -5410,7 +4846,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); 
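// note (not part of the original diff): across the attention builders in this range, the explicit
// KQ_mask / KQ_mask_swa input tensors and the per-layer KQ_mask_l selection are removed; each
// builder instead calls lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case), and the
// models that previously built a sliding-window mask (gemma2, cohere2) pass true, true instead.
// Reading the call sites, the two booleans appear to mean (causal, swa); the mask tensors are
// presumably created and selected per layer inside the context / build_attn now. This is an
// inference from the call sites, not something the diff states.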
cb(cur, "l_out", il); // input for next layer @@ -5419,11 +4855,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -5455,41 +4891,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -5510,9 +4945,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5527,12 +4962,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5543,7 +4978,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5552,13 +4987,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, NULL, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
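// note (not part of the original diff): the attention call sites in these hunks all follow the
// same mechanical rewrite, e.g. (copied from the sliding-window builder above):
//   before: cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo,
//                              Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv,
//                              1.0f / sqrtf(float(n_embd_head)), cb, il);
//   after:  cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
//                            n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il);
// The dropped arguments (ctx0, lctx, kv_self, the KQ mask, kv_head, n_kv) are presumably reached
// through member state; build_norm, build_ffn and build_lora_mm likewise lose their
// ctx0 / lctx / hparams parameters. The exact helper signatures are not shown in this part of the diff.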
ggml_build_forward_expand(gf, cur); @@ -5579,13 +5014,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5595,21 +5029,21 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5629,14 +5063,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -5651,7 +5085,7 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5659,15 +5093,15 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5676,13 +5110,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5707,41 +5141,40 @@ struct llm_build_context { struct ggml_tensor * cur; 
struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5761,9 +5194,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5778,12 +5211,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -5797,7 +5230,7 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5806,13 +5239,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5828,13 +5261,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 
head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -5845,14 +5277,14 @@ struct llm_build_context { struct ggml_tensor * residual = cur; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -5866,14 +5298,14 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Kcur, "Kcur", il); Qcur = ggml_rope_ext( @@ -5891,9 +5323,9 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5908,12 +5340,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5923,7 +5355,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5932,12 +5364,12 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5955,24 +5387,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // 
self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -6000,9 +5431,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6019,13 +5450,13 @@ struct llm_build_context { struct ggml_tensor * attn_out = cur; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6037,7 +5468,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6050,13 +5481,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6065,7 +5496,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6073,13 +5504,13 @@ struct llm_build_context { } } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6100,33 +5531,32 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * 
Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -6143,9 +5573,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6160,12 +5590,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6177,12 +5607,12 @@ struct llm_build_context { cb(ffn_out, "ffn_out", il); // MoE - cur = llm_build_norm(ctx0, inpSA, hparams, + cur = build_norm(inpSA, model.layers[il].ffn_norm_exps, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6198,7 +5628,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6207,13 +5637,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6234,44 +5664,45 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -6292,9 +5723,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6309,13 +5740,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6325,7 +5756,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6340,7 +5771,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6354,7 +5785,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6363,13 +5794,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -6400,21 +5831,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, 
hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -6425,9 +5855,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -6470,9 +5900,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -6524,9 +5954,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6540,13 +5970,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6556,7 +5986,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6571,7 +6001,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6585,7 +6015,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6594,9 +6024,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, 
"result_norm", -1); // lm_head @@ -6617,26 +6047,25 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); if (model.layers[il].wq_scale) { Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); } @@ -6647,7 +6076,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); if (model.layers[il].wk_scale) { Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); } @@ -6658,7 +6087,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); if (model.layers[il].wv_scale) { Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); } @@ -6682,16 +6111,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].wo_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); } @@ -6712,12 +6141,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward forward - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -6725,12 +6154,12 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_sub_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + cur = build_norm(cur, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, il); cb(cur, "ffn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); + cur = build_lora_mm(model.layers[il].ffn_down, cur); if (model.layers[il].ffn_down_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); } @@ -6745,356 +6174,356 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, 
hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output - cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); + cur = build_lora_mm(model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); return gf; } - struct ggml_cgraph * build_t5_enc() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //struct ggml_cgraph * build_t5_enc() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); + // GGML_ASSERT(lctx.is_encoding); + // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + 
// struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + // cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo_enc, cur); + // cb(cur, "kqv_out", il); + // } - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + // cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - 
model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up_enc, NULL, NULL, + // model.layers[il].ffn_gate_enc, NULL, NULL, + // model.layers[il].ffn_down_enc, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // ggml_tensor * layer_dir = cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // input for next layer - inpL = cur; - } + // // input for next layer + // inpL = cur; + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = inpL; + // cb(cur, "result_embd", -1); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // cur = build_norm(cur, + // model.output_norm_enc, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - return gf; - } + // return gf; + //} - struct ggml_cgraph * build_t5_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //struct ggml_cgraph * build_t5_dec() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - GGML_ASSERT(!lctx.is_encoding); - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + // GGML_ASSERT(!lctx.is_encoding); + // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * embd_enc = build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); + // struct ggml_tensor * embd_enc = build_inp_embd_enc(); + // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); - for (int il = 0; il < n_layer; ++il) { - struct 
ggml_tensor * inpSA = inpL; + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + // cb(Vcur, "Vcur", il); - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + // build_kv_store(gf, Kcur, Vcur, il); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + // struct ggml_tensor * k = + // ggml_view_3d(ctx0, kv_self.k_l[il], + // n_embd_head_k, n_kv, n_head_kv, + // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + // 0); + // cb(k, "k", il); - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + // struct ggml_tensor * v = + // ggml_view_3d(ctx0, kv_self.v_l[il], + // n_kv, n_embd_head_v, n_head_kv, + // ggml_element_size(kv_self.v_l[il])*n_ctx, + // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + // 0); + // cb(v, "v", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo, cur); + // cb(cur, "kqv_out", il); + // } - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + // cur = ggml_add(ctx0, cur, inpSA); + // cb(cur, "cross_inp", il); - struct ggml_tensor * inpCA = cur; + // struct ggml_tensor * inpCA = cur; - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + // // norm + // cur = build_norm(cur, + // model.layers[il].attn_norm_cross, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm_cross", il); - // cross-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + // // cross-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", 
il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + // cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo_cross, cur); + // cb(cur, "kqv_out", il); + // } - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + // } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + // cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // input for next layer - inpL = cur; - } + // // input for next layer + // inpL = cur; + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = inpL; + // cb(cur, "result_embd", -1); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); + // cur = build_norm(cur, + // model.output_norm, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // // lm_head + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); - return gf; - } + // return gf; + //} struct ggml_cgraph * build_jais() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -7106,21 +6535,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7136,9 +6564,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), cb, il); } if (il == n_layer - 1) { @@ -7154,13 +6582,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7173,13 +6601,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -7198,21 +6626,20 
@@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -7221,7 +6648,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7249,9 +6676,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7268,13 +6695,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7288,13 +6715,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7312,42 +6739,41 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); 
cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7368,9 +6794,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7384,13 +6810,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7409,13 +6835,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7436,44 +6862,43 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7494,9 +6919,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, 
model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7511,12 +6936,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7536,13 +6961,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7550,230 +6975,232 @@ struct llm_build_context { return gf; } - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //ggml_cgraph * build_rwkv6() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // Token shift state dimensions should be 2 * n_emb - GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + // // Token shift state dimensions should be 2 * n_emb + // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + // inpL = build_inp_embd(model.tok_embd); + // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // 
hparams.n_embd_k_s(), n_seqs); - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); - struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - att_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); + // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - ggml_build_forward_expand(gf, cur); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // att_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); - struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il); - x_prev = ggml_concat( - ctx0, - ffn_shift, - ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - 1 - ); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev)); - ggml_build_forward_expand(gf, cur); + // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); + // ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); + // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + // x_prev = ggml_concat( + // ctx0, + // ffn_shift, + // 
ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), + // 1 + // ); + // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); + // ggml_build_forward_expand(gf, cur); - token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); + // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - ) - ); + // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - cur = ggml_scale(ctx0, cur, 0.5F); - } + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + // ) + // ); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); + // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + // cur = ggml_scale(ctx0, cur, 0.5F); + // } - // input for next layer - inpL = cur; - } + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // // input for next layer + // inpL = cur; + // } - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cb(cur, "result_norm", -1); + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); - return gf; - } + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //ggml_cgraph * build_rwkv6qwen2() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = 
ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // hparams.n_embd_k_s(), n_seqs); - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - token_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // token_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - ggml_build_forward_expand(gf, ffn_inp); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, 
hparams.wkv_head_size, hparams.n_head_kv())); + // ggml_build_forward_expand(gf, ffn_inp); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // cb(ffn_inp, "ffn_inp", il); - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); + // // feed-forward network + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + // cb(cur, "ffn_out", il); - // input for next layer - inpL = cur; - } + // cur = ggml_add(ctx0, cur, ffn_inp); + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // // input for next layer + // inpL = cur; + // } - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); - return gf; - } + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: @@ -7794,13 +7221,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7809,22 +7235,22 @@ struct llm_build_context { if (hparams.swin_norm) { cur = inpL; } else { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); 
cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].attn_q_norm) { @@ -7834,10 +7260,10 @@ struct llm_build_context { 0); cb(Qcur, "Qcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } @@ -7848,10 +7274,10 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -7869,14 +7295,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); } } @@ -7893,13 +7319,13 @@ struct llm_build_context { // feed-forward network if (!hparams.swin_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7908,9 +7334,9 @@ struct llm_build_context { cb(cur, "ffn_out", il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } @@ -7926,13 +7352,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. 
@@ -7959,7 +7385,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); @@ -7978,20 +7404,20 @@ struct llm_build_context { case 3: case 4: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm1, layer.norm1_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); cur = ggml_add(ctx0, cur, layer.conv1_b); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm2, layer.norm2_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); @@ -8002,10 +7428,10 @@ struct llm_build_context { } break; case 2: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.attn_norm, layer.attn_norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); struct ggml_tensor * q; struct ggml_tensor * k; @@ -8035,10 +7461,10 @@ struct llm_build_context { } break; case 5: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -8046,10 +7472,10 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); @@ -8066,12 +7492,12 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, layer.pw1, layer.pw1_b, NULL, NULL, NULL, NULL, layer.pw2, layer.pw2_b, NULL, @@ -8089,13 +7515,13 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); @@ -8106,7 +7532,7 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { llama_ubatch dummy = {}; dummy.equal_seqs = true; @@ -8116,7 +7542,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const llm.init(); - struct ggml_cgraph * result = llm.build_defrag(ids); + struct ggml_cgraph * result = llm.build_defrag(); llm.free(); @@ -8356,18 +7782,18 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; - case LLM_ARCH_T5: - { - if (lctx.is_encoding) { - result = llm.build_t5_enc(); - } else { - result = llm.build_t5_dec(); - } - } break; - case LLM_ARCH_T5ENCODER: - { - result = llm.build_t5_enc(); - } break; + //case LLM_ARCH_T5: + // { + // if (lctx.is_encoding) { + // result = llm.build_t5_enc(); + // } else { + // result = llm.build_t5_dec(); + // } + // } break; + //case LLM_ARCH_T5ENCODER: + // { + // result = llm.build_t5_enc(); + // } break; case 
LLM_ARCH_JAIS: { result = llm.build_jais(); @@ -8380,14 +7806,14 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - case LLM_ARCH_RWKV6: - { - result = llm.build_rwkv6(); - } break; - case LLM_ARCH_RWKV6QWEN2: - { - result = llm.build_rwkv6qwen2(); - } break; + //case LLM_ARCH_RWKV6: + // { + // result = llm.build_rwkv6(); + // } break; + //case LLM_ARCH_RWKV6QWEN2: + // { + // result = llm.build_rwkv6qwen2(); + // } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); @@ -8543,6 +7969,7 @@ static int llama_decode_impl( } else { ubatch = lctx.sbatch.split_simple(n_ubatch); } + const uint32_t n_tokens = ubatch.n_tokens; // count the outputs in this u_batch @@ -8567,6 +7994,8 @@ static int llama_decode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + // non-causal masks do not use the KV cache if (hparams.causal_attn) { llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() @@ -8600,6 +8029,12 @@ static int llama_decode_impl( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output is always the last tensor in the graph struct ggml_tensor * res = ggml_graph_node(gf, -1); struct ggml_tensor * embd = ggml_graph_node(gf, -2); @@ -8623,12 +8058,6 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); @@ -8850,11 +8279,17 @@ static int llama_encode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output embeddings after the final encoder normalization struct ggml_tensor * embd = nullptr; @@ -8875,10 +8310,6 @@ static int llama_encode_impl( } } - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -8966,227 +8397,6 @@ static int llama_encode_impl( return 0; } -// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache -static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { - auto & kv_self = lctx.kv_self; - - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = 
@@ -8966,227 +8397,6 @@ static int llama_encode_impl(
     return 0;
 }
 
-// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
-    auto & kv_self = lctx.kv_self;
-
-    const auto & hparams = lctx.model.hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    const uint32_t n_kv   = kv_self.cell_max();
-    const uint32_t n_used = kv_self.used;
-
-    assert(n_used <= n_kv);
-
-    //const int64_t t_start = ggml_time_us();
-
-    // number of cells moved
-    uint32_t n_moves = 0;
-
-    // each move requires 6*n_layer tensors (see build_defrag)
-    //   - source view, destination view, copy operation
-    //   - x2 for keys and values
-    //const uint32_t max_moves = model.max_nodes()/(6*n_layer);
-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-    const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer);
-
-    // determine which KV cells to move where
-    //
-    //  cell i moves to ids[i]
-    //
-    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
-    //
-    std::vector<uint32_t> ids(n_kv, n_kv);
-
-    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-        const auto & cell0 = kv_self.cells[i0];
-
-        if (!cell0.is_empty()) {
-            ids[i0] = i0;
-
-            continue;
-        }
-
-        // found a hole - fill it with data from the end of the cache
-
-        uint32_t nh = 1;
-
-        // determine the size of the hole
-        while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
-            nh++;
-        }
-
-        uint32_t nf = 0;
-        uint32_t is = n_kv - 1;
-
-        // starting from the end, find nh non-empty cells
-        for (; is > i0; --is) {
-            const auto & cell1 = kv_self.cells[is];
-
-            if (cell1.is_empty() || ids[is] != n_kv) {
-                continue;
-            }
-
-            // non-empty cell which is not yet moved
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        // this can only happen if `n_used` is not accurate, which would be a bug
-        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
-        nf = 0;
-
-        uint32_t i1 = is;
-
-        // are we moving a continuous block of memory?
-        bool cont = false;
-
-        // should we stop searching for the next move?
-        bool stop = false;
-
-        // go back and move the nf cells to the hole
-        for (; i1 < n_kv; ++i1) {
-            auto & cell1 = kv_self.cells[i1];
-
-            if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                cont = false;
-                continue;
-            }
-
-            // this cell goes to (i0 + nf)
-            ids[i1] = i0 + nf;
-
-            // move the cell meta data
-            kv_self.cells[i0 + nf] = cell1;
-
-            // clear the old cell and move the head there
-            cell1 = llama_kv_cell();
-            kv_self.head = n_used;
-
-            if (!cont) {
-                n_moves++;
-                cont = true;
-            }
-
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
-        i0 += nh - 1;
-    }
-
-    if (n_moves == 0) {
-        return;
-    }
-
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
-
-#if 0
-    // CPU defrag
-    //
-    // TODO: optimizations are possible:
-    //       - multiple threads
-    //       - avoid copying to the host memory when already there
-    //
-    // likely not worth the effort, as we have ggml_graph based defrag
-    //
-
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
-    const uint32_t kv_size = kv_self.size;
-
-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
-        const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
-
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        const size_t v_size    = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
-
-        buf_k.resize(k_size);
-        buf_v.resize(v_size);
-
-        ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
-
-        // batch move [i, i+nm) to [id, id+nm)
-        // note: cells can move only to a lower index
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == n_kv) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < n_kv && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-            // move keys
-            {
-                const int64_t os =  i*k_size_row;
-                const int64_t od = id*k_size_row;
-
-                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
-            }
-
-            // move values (note: they are transposed)
-            {
-                const int64_t os =  i;
-                const int64_t od = id;
-
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
-                }
-            }
-
-            i += nm - 1;
-        }
-
-        ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
-    }
-#else
-    // ggml_graph defrag
-
-    ggml_backend_sched_reset(lctx.sched.get());
-
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
-
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
-#endif
-
-    //const int64_t t_end = ggml_time_us();
-
-    //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
-}
-
 // TODO: move to llama_context
 static void llama_kv_self_update_impl(llama_context & lctx) {
     bool need_reserve = false;
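For reference, a simplified standalone sketch (not part of this patch) of the planning pass in the llama_kv_cache_defrag_impl() removed above: holes at the front of the cache are filled with occupied cells taken from the back, and ids[i] records the destination of cell i, where ids[i] == i or ids[i] == n_kv means the cell does not move. The max_moves cap and the contiguous-block bookkeeping of the real code are left out, so the donor order can differ from the original.

// illustrative only: toy version of the KV defrag "ids" planning pass
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // toy occupancy map: true = cell holds data, false = hole
    const std::vector<bool> used = {true, false, false, true, true, false, true, true};

    const uint32_t n_kv = (uint32_t) used.size();

    std::vector<uint32_t> ids(n_kv, n_kv); // n_kv means "not moved"

    uint32_t dst = 0;        // first hole candidate, scanning from the front
    uint32_t src = n_kv - 1; // donor candidate, scanning from the back

    while (true) {
        while (dst < n_kv &&  used[dst]) { ids[dst] = dst; dst++; } // occupied cells stay in place
        while (src > dst  && !used[src]) { src--;                 } // skip holes at the back

        if (dst >= src) break; // nothing left to compact

        ids[src] = dst; // cell src moves into the hole at dst
        dst++;
        src--;
    }

    for (uint32_t i = 0; i < n_kv; ++i) {
        if (ids[i] != i && ids[i] != n_kv) {
            printf("cell %u -> cell %u\n", i, ids[i]);
        }
    }
    // prints: cell 6 -> cell 2, cell 7 -> cell 1
    return 0;
}

In the real code the donors are moved as contiguous blocks so that one copy in the defrag graph can cover several cells; the sketch ignores that and moves one cell at a time.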
@@ -9200,13 +8410,15 @@ static void llama_kv_self_update_impl(llama_context & lctx) {
 
     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
+        lctx.prepare_k_shift();
+
         ggml_backend_sched_reset(lctx.sched.get());
 
         ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
 
         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
 
-        lctx.set_k_shift(kv);
+        lctx.set_inputs({});
 
         llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 
@@ -9224,7 +8436,13 @@ static void llama_kv_self_update_impl(llama_context & lctx) {
 
     // defragment the KV cache if needed
     if (kv.do_defrag) {
-        llama_kv_cache_defrag_impl(lctx);
+        lctx.prepare_defrag();
+
+        ggml_backend_sched_reset(lctx.sched.get());
+
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx);
+
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 
         need_reserve = true;
 
@@ -9253,16 +8471,16 @@ int32_t llama_set_adapter_lora(
         struct llama_context * ctx,
         struct llama_adapter_lora * adapter,
         float scale) {
-    ctx->lora[adapter] = scale;
+    ctx->loras[adapter] = scale;
     return 0;
 }
 
 int32_t llama_rm_adapter_lora(
         struct llama_context * ctx,
         struct llama_adapter_lora * adapter) {
-    auto pos = ctx->lora.find(adapter);
-    if (pos != ctx->lora.end()) {
-        ctx->lora.erase(pos);
+    auto pos = ctx->loras.find(adapter);
+    if (pos != ctx->loras.end()) {
+        ctx->loras.erase(pos);
         return 0;
     }
 
@@ -9270,7 +8488,7 @@ int32_t llama_rm_adapter_lora(
 }
 
 void llama_clear_adapter_lora(struct llama_context * ctx) {
-    ctx->lora.clear();
+    ctx->loras.clear();
 }
 
 int32_t llama_apply_adapter_cvec(