llama : use llm_build_lora_mm in most model graphs

Francis Couture-Harpin 2024-07-15 03:23:19 -04:00
parent 8956543c09
commit 87301bdd59

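The new helper wraps the plain matrix multiply so that any LoRA adapters loaded into the context are applied on top of the base weight. Below is a minimal sketch of what llm_build_lora_mm is assumed to do; the adapter container and field names (lora_adapters, get_weight, alpha, the per-adapter user scale) are illustrative, not the exact API:

    // Sketch only: for a weight tensor w and activations cur, compute
    // w*cur, then add scale * B*(A*cur) for every active LoRA adapter
    // that carries a low-rank pair (A, B) for this weight.
    static struct ggml_tensor * llm_build_lora_mm(
            struct llama_context & lctx,
             struct ggml_context * ctx0,
              struct ggml_tensor * w,
              struct ggml_tensor * cur) {
        struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
        for (auto & it : lctx.lora_adapters) {                          // assumed: adapter -> user scale
            struct llama_lora_weight * lora = it.first->get_weight(w);  // assumed lookup by base weight
            if (lora == nullptr) {
                continue;                                               // this weight has no LoRA pair
            }
            // common LoRA scaling convention: scale = user_scale * alpha / rank
            const float rank  = (float) lora->b->ne[0];
            const float scale = it.first->alpha ? it.second * it.first->alpha / rank : it.second;
            struct ggml_tensor * ab_cur = ggml_mul_mat(ctx0, lora->b,
                                          ggml_mul_mat(ctx0, lora->a, cur));
            ab_cur = ggml_scale(ctx0, ab_cur, scale);
            res    = ggml_add(ctx0, res, ab_cur);
        }
        return res;
    }

This is also why every call site in the hunks below gains the extra lctx argument: the graph builder needs the context to know which adapters are active and at what scale.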

@@ -8945,13 +8945,13 @@ struct llm_build_context {
 // self-attention
 {
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 switch (model.type) {
@@ -9024,7 +9024,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -9060,13 +9060,13 @@ struct llm_build_context {
 // self-attention
 {
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Qcur = ggml_rope_ext(
@@ -9127,7 +9127,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -9176,7 +9176,7 @@ struct llm_build_context {
 cur = attn_norm;
 }
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
@@ -9247,7 +9247,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -9292,21 +9292,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].bq) {
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
 }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].bk) {
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
 }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 if (model.layers[il].bv) {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -9398,7 +9398,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 // Grok
 // multiply logits by output_multiplier_scale of 0.5773502691896257
@@ -9449,7 +9449,7 @@ struct llm_build_context {
 struct ggml_tensor * Kcur = nullptr;
 struct ggml_tensor * Vcur = nullptr;
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -9529,7 +9529,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
@@ -9571,7 +9571,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -9634,7 +9634,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -9666,13 +9666,13 @@ struct llm_build_context {
 // self-attention
 {
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
@@ -9728,7 +9728,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -9780,7 +9780,7 @@ struct llm_build_context {
 // self-attention
 if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
-Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].attn_q_norm) {
@@ -9790,7 +9790,7 @@ struct llm_build_context {
 LLM_NORM, cb, il);
 }
-Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].attn_k_norm) {
@@ -9799,14 +9799,14 @@ struct llm_build_context {
 model.layers[il].attn_k_norm_b,
 LLM_NORM, cb, il);
 }
-Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv);
 cb(Vcur, "Vcur", il);
 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 } else {
 // compute Q and K and RoPE them
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
@@ -9855,7 +9855,7 @@ struct llm_build_context {
 ggml_build_forward_expand(gf, cur);
-cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
 if (model.layers[il].bo) {
 cb(cur, "kqv_wo", il);
 }
@@ -9960,7 +9960,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -10023,7 +10023,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -10070,7 +10070,7 @@ struct llm_build_context {
 {
 cur = attn_norm;
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 if (model.layers[il].bqkv){
@@ -10163,7 +10163,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -10203,21 +10203,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].bq) {
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
 }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].bk) {
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
 }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 if (model.layers[il].bv) {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -10313,7 +10313,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -10348,7 +10348,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -10425,7 +10425,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -10463,17 +10463,17 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 cb(Vcur, "Vcur", il);
@@ -10537,7 +10537,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -10578,17 +10578,17 @@ struct llm_build_context {
 // self_attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 cb(Vcur, "Vcur", il);
@@ -10643,7 +10643,7 @@ struct llm_build_context {
 // FFN shared expert
 {
-ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
+ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
 cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
 // sigmoid
@@ -10683,7 +10683,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -10725,7 +10725,7 @@ struct llm_build_context {
 struct ggml_tensor * Vcur = nullptr;
 if (model.layers[il].wqkv) {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -10735,9 +10735,9 @@ struct llm_build_context {
 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 } else {
-Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
-Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
-Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
 }
 cb(Qcur, "Qcur", il);
@@ -10803,7 +10803,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output_no_bias", -1);
 cur = ggml_add(ctx0, cur, model.output_b);
@@ -10849,7 +10849,7 @@ struct llm_build_context {
 struct ggml_tensor * Vcur = nullptr;
 if (model.layers[il].wqkv) {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
 cb(cur, "wqkv", il);
 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
@@ -10857,9 +10857,9 @@ struct llm_build_context {
 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
 }
 else {
-Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
-Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
-Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
 }
 cb(Qcur, "Qcur", il);
@@ -10931,7 +10931,7 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -10971,13 +10971,13 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Qcur = ggml_rope_ext(
@@ -11036,7 +11036,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -11078,7 +11078,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -11141,7 +11141,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -11177,7 +11177,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -11252,7 +11252,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -11290,21 +11290,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 // if (model.layers[il].bq) {
 // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 // cb(Qcur, "Qcur", il);
 // }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 // if (model.layers[il].bk) {
 // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 // cb(Kcur, "Kcur", il);
 // }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 // if (model.layers[il].bv) {
 // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11370,7 +11370,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -11408,21 +11408,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].bq) {
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
 }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].bk) {
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
 }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 if (model.layers[il].bv) {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11488,7 +11488,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -11539,21 +11539,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].bq) {
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
 }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].bk) {
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
 }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 if (model.layers[il].bv) {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11635,7 +11635,7 @@ struct llm_build_context {
 cb(cur, "lmhead_scaling", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -11672,13 +11672,13 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Qcur = ggml_rope_ext(
@@ -11743,7 +11743,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -11785,13 +11785,13 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Qcur = ggml_rope_ext(
@@ -11866,7 +11866,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 // final logit soft-capping
 cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
@@ -11911,21 +11911,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].bq) {
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
 }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].bk) {
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
 }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 if (model.layers[il].bv) {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11992,7 +11992,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -12044,7 +12044,7 @@ struct llm_build_context {
 cb(cur, "attn_norm", il);
 // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens}
-struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
+struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, cur);
 // split the above in two
 // => {d_inner, n_tokens}
 struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0);
@@ -12083,14 +12083,14 @@ struct llm_build_context {
 // ssm
 {
 // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens}
-struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x);
+struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_x, x);
 // split
 struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
 struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
 struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
 // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
-dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt);
+dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
 dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
 // Custom operator to optimize the parallel associative scan
@@ -12121,7 +12121,7 @@ struct llm_build_context {
 y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
 // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens}
-cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, y);
 }
 // residual
@@ -12140,7 +12140,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -12179,21 +12179,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].bq) {
 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 cb(Qcur, "Qcur", il);
 }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].bk) {
 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 cb(Kcur, "Kcur", il);
 }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 if (model.layers[il].bv) {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -12283,7 +12283,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 if (f_logit_scale) {
 cur = ggml_scale(ctx0, cur, f_logit_scale);
@@ -12336,21 +12336,21 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 if (hparams.f_clamp_kqv > 0.0f) {
 Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
 cb(Qcur, "Qcur", il);
 }
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
 if (hparams.f_clamp_kqv > 0.0f) {
 Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
 cb(Kcur, "Kcur", il);
 }
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 if (hparams.f_clamp_kqv > 0.0f) {
 Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -12419,7 +12419,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -12459,7 +12459,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
@@ -12544,7 +12544,7 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -12579,7 +12579,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -12686,7 +12686,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -12727,13 +12727,13 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
 Qcur = ggml_rope_ext(
@@ -12818,7 +12818,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -13087,7 +13087,7 @@ struct llm_build_context {
 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
 cb(Qcur, "Qcur", il);
 if (model.layers[il].bq) {
@@ -13096,7 +13096,7 @@ struct llm_build_context {
 }
 // B1.K
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
 cb(Kcur, "Kcur", il);
 if (model.layers[il].bk) {
@@ -13105,7 +13105,7 @@ struct llm_build_context {
 }
 // B1.V
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
 cb(Vcur, "Vcur", il);
 if (model.layers[il].bv) {
@@ -13136,7 +13136,7 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, il);
 cb(cur, "attn_sub_norm", il);
-cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
 cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
 if (model.layers[il].bo) {
 cur = ggml_add(ctx0, cur, model.layers[il].bo);
@@ -13173,7 +13173,7 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, il);
 cb(cur, "ffn_sub_norm", il);
-cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
 cb(cur, "ffn_down", il);
@@ -13192,7 +13192,7 @@ struct llm_build_context {
 cb(cur, "result_norm", -1);
 // lm_head
-cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
@@ -13540,7 +13540,7 @@ struct llm_build_context {
 // self-attention
 {
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -13599,7 +13599,7 @@ struct llm_build_context {
 LLM_NORM, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
@@ -13641,7 +13641,7 @@ struct llm_build_context {
 struct ggml_tensor * Kcur = nullptr;
 struct ggml_tensor * Vcur = nullptr;
-cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -13714,7 +13714,7 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, -1);
 cb(cur, "result_norm", -1);
-cur = ggml_mul_mat(ctx0, model.output, cur);
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);