From f6d090d7de2544be6a508d53630e791d9ce0751f Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 7 Jul 2024 16:01:05 +0200 Subject: [PATCH] add llm_build_mm --- common/common.cpp | 4 +- ggml/src/ggml.c | 2 +- include/llama.h | 24 ++- src/llama.cpp | 469 ++++++++++++++++++++-------------------- 4 files changed, 221 insertions(+), 278 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d5dd4d38d..ec5709f83 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2063,14 +2063,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str(), lora_scale); + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - llama_lora_adapter_apply(lctx, adapter); + llama_lora_adapter_set(lctx, adapter, lora_scale); } if (params.ignore_eos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2e09b7087..2093be2a9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; diff --git a/include/llama.h b/include/llama.h index 50ea0d847..37140b771 100644 --- a/include/llama.h +++ b/include/llama.h @@ -508,19 +508,29 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a LoRA adapter to a loaded model - // path_base_model is the path to a higher quality model to use as a base for - // the layers modified by the adapter. Can be NULL to use the current loaded model. - // The model needs to be reloaded before applying a new adapter, otherwise the adapter - // will be applied on top of the previous one + // Load a LoRA adapter from file + // The loaded adapter will be associated with the given model, and will be freed when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( + struct llama_model * model, + const char * path_lora); + + // Add a loaded LoRA adapter to the given context + // This will not modify the model's weights LLAMA_API int32_t llama_lora_adapter_set( struct llama_context * ctx, - const char * path_lora, + struct llama_lora_adapter * adapter, float scale); - LLAMA_API int32_t llama_lora_adapter_apply( + + // Remove a LoRA adapter from the given context + // Return -1 if the adapter is not present in the context + LLAMA_API int32_t llama_lora_adapter_remove( struct llama_context * ctx, struct llama_lora_adapter * adapter); + // Manually free a LoRA adapter + // Note: loaded adapters will be freed when the associated model is deleted + LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear // n_embd should be the size of a single layer's control, and data should point
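For context, the llama.h hunk above replaces the old merge-into-weights call with a load-once, attach-per-context design. A minimal hypothetical caller is sketched below: the four llama_lora_adapter_* functions are the declarations above, while the model/context entry points (llama_load_model_from_file, llama_new_context_with_model, and friends) are the standard llama.h API assumed unchanged by this patch, and the file paths are placeholders.

```cpp
// Hypothetical usage sketch -- not part of this patch.
#include "llama.h"

int main() {
    llama_backend_init();

    struct llama_model * model = llama_load_model_from_file(
        "model.gguf", llama_model_default_params());
    if (model == nullptr) {
        return 1;
    }
    struct llama_context * ctx = llama_new_context_with_model(
        model, llama_context_default_params());

    // load once per model; ownership stays with the model
    struct llama_lora_adapter * adapter =
        llama_lora_adapter_init(model, "adapter.gguf");
    if (adapter != nullptr) {
        llama_lora_adapter_set(ctx, adapter, 0.8f); // attach to this context at scale 0.8
        // ... run inference: the LoRA delta is applied at graph-build time ...
        llama_lora_adapter_remove(ctx, adapter);    // detach; base weights were never modified
    }

    llama_free(ctx);         // the context does not own the adapter
    llama_free_model(model); // frees any adapters still attached to the model
    llama_backend_free();
    return 0;
}
```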
diff --git a/src/llama.cpp b/src/llama.cpp index 5f02106d3..ee18ca847 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,30 +2547,6 @@ struct llama_control_vector { } }; -struct lora_weight { - struct ggml_tensor * a = nullptr; - struct ggml_tensor * b = nullptr; - lora_weight() {} - lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} -}; - -struct llama_lora_adapter { - // map tensor name to lora_a_b - std::map<std::string, lora_weight> ab_map; - std::vector<struct ggml_context *> ctxs; - std::vector<ggml_backend_buffer_t> bufs; - float scale = 1.0f; - - ~llama_lora_adapter() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } -}; - struct llama_vocab { using id = int32_t; using token = std::string; @@ -2703,9 +2679,8 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; - // used by lora, to save model's original tensors - std::vector<struct ggml_tensor *> orig_tensors; - std::vector<llama_layer> orig_layers; + // keep track of loaded lora adapters + std::set<struct llama_lora_adapter *> lora_adapters; ~llama_model() { for (struct ggml_context * ctx : ctxs) { @@ -2719,6 +2694,9 @@ struct llama_model { #endif ggml_backend_buffer_free(buf); } + while (!lora_adapters.empty()) { + llama_lora_adapter_free(*lora_adapters.begin()); + } } }; @@ -2732,10 +2710,6 @@ struct llama_context { } ggml_backend_buffer_free(buf_output); - - for (auto adapter : lora_adapters) { - delete adapter; - } } llama_cparams cparams; @@ -2828,8 +2802,50 @@ struct llama_context { // control vectors struct llama_control_vector cvec; - // lora adapters - std::vector<struct llama_lora_adapter *> lora_adapters; + // lora adapters and scales + std::map<struct llama_lora_adapter *, float> lora_adapters; +}; + +struct lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + lora_weight() {} + lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + struct llama_model * base_model; + // map tensor name to lora_a_b + std::map<std::string, lora_weight> ab_map; + std::vector<struct ggml_context *> ctxs; + std::vector<ggml_backend_buffer_t> bufs; + + llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { + base_model->lora_adapters.insert(this); + } + + bool has_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.find(name) != ab_map.end(); + } + + lora_weight & get_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.at(name); + } + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + auto pos = base_model->lora_adapters.find(this); + if (pos != base_model->lora_adapters.end()) { + base_model->lora_adapters.erase(pos); + } + } }; static size_t llama_get_device_count(const llama_model & model) { @@ -7773,6 +7789,32 @@ static void llm_build_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); } +// do mat_mul, optionally applying lora +static struct ggml_tensor * llm_build_mm( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_adapter * adapter = it.first; + float scale = it.second; + if (!adapter->has_weight(w)) { + continue; + } + struct lora_weight & lora = adapter->get_weight(w); + // TODO: check if lora_a needs transpose + struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0,
lora.a)); + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lora.b, + ggml_mul_mat(ctx0, a, cur) + ); + ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -7806,6 +7848,7 @@ static struct ggml_tensor * llm_build_norm( } static struct ggml_tensor * llm_build_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * up, @@ -7822,7 +7865,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? llm_build_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7839,12 +7882,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = ggml_mul_mat(ctx, gate, tmp); + cur = llm_build_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = ggml_mul_mat(ctx, gate, cur); + cur = llm_build_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -7899,7 +7942,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = ggml_mul_mat(ctx, down, cur); + cur = llm_build_mm(lctx, ctx, down, cur); } if (down_b) { @@ -7919,6 +7962,7 @@ static struct ggml_tensor * llm_build_ffn( } static struct ggml_tensor * llm_build_moe_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * gate_inp, @@ -7936,7 +7980,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8019,6 +8063,7 @@ static struct ggml_tensor * llm_build_moe_ffn( } static struct ggml_tensor * llm_build_kqv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8076,7 +8121,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q); cb(kq, "kq", il); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { @@ -8119,7 +8164,7 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq); cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); @@ -8132,7 +8177,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = ggml_mul_mat(ctx, wo, cur); + cur = llm_build_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8147,6 +8192,7 @@ static struct ggml_tensor * llm_build_kqv( } static struct ggml_tensor * llm_build_kv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8176,7 +8222,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, + cur = llm_build_kqv(lctx, ctx, model, hparams, 
cparams, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -8638,21 +8684,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8673,7 +8719,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8696,7 +8742,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -8710,7 +8756,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -8740,7 +8786,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -8808,7 +8854,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8830,7 +8876,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -8913,7 +8959,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8935,7 +8981,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ 
-9034,7 +9080,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9051,7 +9097,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + cur = llm_build_ffn(lctx, ctx0, attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9158,7 +9204,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -9190,7 +9236,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9308,7 +9354,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9331,7 +9377,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9418,7 +9464,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9442,7 +9488,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9512,7 +9558,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9534,7 +9580,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9719,21 +9765,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, 
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9807,7 +9853,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9831,7 +9877,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9939,13 +9985,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9969,7 +10015,7 @@ struct llm_build_context { model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10090,7 +10136,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10118,7 +10164,7 @@ struct llm_build_context { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10209,7 +10255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10231,7 +10277,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, 
NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10323,7 +10369,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10344,7 +10390,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10438,7 +10484,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10461,7 +10507,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -10481,7 +10527,7 @@ struct llm_build_context { ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, + ggml_tensor * cur_ffn = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -10595,7 +10641,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10610,7 +10656,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, attn_norm_output, + ffn_output = llm_build_ffn(lctx, ctx0, attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10715,7 +10761,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10830,7 +10876,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10848,7 +10894,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10932,7 +10978,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, 
gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10956,7 +11002,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11043,7 +11089,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11067,7 +11113,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11163,7 +11209,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11184,7 +11230,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11281,7 +11327,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11302,7 +11348,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11412,7 +11458,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11439,7 +11485,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11534,7 +11580,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11556,7 +11602,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, 
NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11647,7 +11693,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11674,7 +11720,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11784,7 +11830,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11806,7 +11852,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12077,7 +12123,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12094,7 +12140,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, ffn_inp, + cur = llm_build_ffn(lctx, ctx0, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12209,7 +12255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12231,7 +12277,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12336,7 +12382,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12358,7 +12404,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12445,7 +12491,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, 
n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12470,7 +12516,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12501,7 +12547,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12588,7 +12634,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12610,7 +12656,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12627,7 +12673,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12810,7 +12856,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -12832,7 +12878,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12847,7 +12893,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12860,7 +12906,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -12965,7 +13011,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -12998,7 +13044,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -13132,7 +13178,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); 
// T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_enc, NULL, NULL, model.layers[il].ffn_gate_enc, NULL, NULL, model.layers[il].ffn_down_enc, NULL, NULL, @@ -13310,7 +13356,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -13392,7 +13438,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -13416,7 +13462,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -18278,7 +18324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { +static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -18310,11 +18356,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co // count layer buffer types std::map<ggml_backend_buffer_type_t, int> buft_tensor_count; - for (int64_t i = 0; i < model.hparams.n_layer; i++) { - buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer; + for (int64_t i = 0; i < model->hparams.n_layer; i++) { + buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; } - buft_tensor_count[model.buft_input.buft] += n_inp_tensors; - buft_tensor_count[model.buft_output.buft] += n_out_tensors; + buft_tensor_count[model->buft_input.buft] += n_inp_tensors; + buft_tensor_count[model->buft_output.buft] += n_out_tensors; // allocate contexts std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; @@ -18371,11 +18417,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co sscanf(cname, "blk.%d.", &il); struct ggml_context * dev_ctx; // device ctx if (il >= 0) { - dev_ctx = ctx_map.at(model.buft_layer[il].buft); + dev_ctx = ctx_map.at(model->buft_layer[il].buft); } else if (strstr(cname, "tok") == 0) { - dev_ctx = ctx_map.at(model.buft_input.buft); + dev_ctx = ctx_map.at(model->buft_input.buft); } else if (strstr(cname, "output") == 0) { - dev_ctx = ctx_map.at(model.buft_output.buft); + dev_ctx = ctx_map.at(model->buft_output.buft); } else { LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); continue; } @@ -18436,136 +18482,29 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co return 0; }
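The loading loop that actually fills ab_map falls outside the hunks above; the toy sketch below illustrates the pairing idea it implements. The "<base>.lora_a" / "<base>.lora_b" naming convention and all types here are assumptions for illustration, not code from the patch.

```cpp
// Toy sketch: pair "<base>.lora_a" / "<base>.lora_b" tensors under the base
// tensor name, so llm_build_mm can later look them up via w->name.
#include <cstdio>
#include <map>
#include <string>

struct toy_tensor { std::string name; };  // stand-in for ggml_tensor
struct toy_lora_weight { toy_tensor * a = nullptr; toy_tensor * b = nullptr; };

int main() {
    toy_tensor tensors[] = {
        {"blk.0.attn_q.weight.lora_a"},
        {"blk.0.attn_q.weight.lora_b"},
    };
    std::map<std::string, toy_lora_weight> ab_map;
    for (toy_tensor & t : tensors) {
        const std::string & n = t.name;
        const std::string suffix = n.size() > 7 ? n.substr(n.size() - 7) : "";
        if (suffix == ".lora_a") {
            ab_map[n.substr(0, n.size() - 7)].a = &t;  // strip ".lora_a"
        } else if (suffix == ".lora_b") {
            ab_map[n.substr(0, n.size() - 7)].b = &t;  // strip ".lora_b"
        }
    }
    // lookup mirrors llama_lora_adapter::has_weight / get_weight, keyed by name
    printf("pairs: %zu, has blk.0.attn_q.weight: %d\n",
           ab_map.size(), (int) ab_map.count("blk.0.attn_q.weight"));
    return 0;
}
```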
-int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_adapter * adapter) { - GGML_ASSERT(!lctx->lora_adapters.empty()); - const struct llama_model & model = lctx->model; - struct ggml_init_params ctx0_params = { - /*.mem_size =*/ lctx->buf_compute_meta.size(), - /*.mem_buffer =*/ lctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - struct ggml_context * ctx0 = ggml_init(ctx0_params); - - // map "merged.%s" name to model tensor - std::map<std::string, struct ggml_tensor *> output_map; - // apply lora for model tensors - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) { - if (model_tensor == nullptr) { - return; - } - std::string name = ggml_get_name(model_tensor); - if (adapter->ab_map.find(name) != adapter->ab_map.end()) { - auto lora_w = adapter->ab_map[name]; - struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); - cur = ggml_scale_inplace(ctx0, cur, adapter->scale); - cur = ggml_add(ctx0, cur, model_tensor); - ggml_format_name(cur, "merged.%s", name.c_str()); - ggml_build_forward_expand(gf, cur); - output_map[std::string(cur->name)] = model_tensor; - } - }; - apply_lora(adapter, model.tok_embd); - apply_lora(adapter, model.type_embd); - apply_lora(adapter, model.pos_embd); - apply_lora(adapter, model.tok_norm); - apply_lora(adapter, model.tok_norm_b); - apply_lora(adapter, model.output_norm); - apply_lora(adapter, model.output_norm_b); - apply_lora(adapter, model.output); - apply_lora(adapter, model.output_b); - apply_lora(adapter, model.output_norm_enc); - for (const llama_layer & layer : model.layers) { - apply_lora(adapter, layer.attn_norm); - apply_lora(adapter, layer.attn_norm_b); - apply_lora(adapter, layer.attn_norm_2); - apply_lora(adapter, layer.attn_norm_2_b); - apply_lora(adapter, layer.attn_q_norm); - apply_lora(adapter, layer.attn_q_norm_b); - apply_lora(adapter, layer.attn_k_norm); - apply_lora(adapter, layer.attn_k_norm_b); - apply_lora(adapter, layer.attn_out_norm); - apply_lora(adapter, layer.attn_out_norm_b); - apply_lora(adapter, layer.attn_q_a_norm); - apply_lora(adapter, layer.attn_kv_a_norm); - apply_lora(adapter, layer.attn_sub_norm); - apply_lora(adapter, layer.attn_post_norm); - apply_lora(adapter, layer.ffn_sub_norm); - apply_lora(adapter, layer.attn_norm_cross); - apply_lora(adapter, layer.attn_norm_enc); - - apply_lora(adapter, layer.wq); - apply_lora(adapter, layer.wk); - apply_lora(adapter, layer.wv); - apply_lora(adapter, layer.wo); - apply_lora(adapter, layer.wqkv); - apply_lora(adapter, layer.wq_a); - apply_lora(adapter, layer.wq_b); - apply_lora(adapter, layer.wkv_a_mqa); - apply_lora(adapter, layer.wkv_b); - - apply_lora(adapter, layer.bq); - apply_lora(adapter, layer.bk); - apply_lora(adapter, layer.bv); - apply_lora(adapter, layer.bo); - apply_lora(adapter, layer.bqkv); - - apply_lora(adapter, layer.ffn_norm); - apply_lora(adapter, layer.ffn_norm_b); - apply_lora(adapter, layer.ffn_post_norm); - apply_lora(adapter, layer.layer_out_norm); - apply_lora(adapter, layer.layer_out_norm_b); - apply_lora(adapter, layer.ffn_norm_exps); - apply_lora(adapter, layer.ffn_norm_enc); - - apply_lora(adapter, layer.ffn_gate); - apply_lora(adapter, layer.ffn_down); - apply_lora(adapter, layer.ffn_up); - apply_lora(adapter, layer.ffn_gate_enc); - apply_lora(adapter, layer.ffn_down_enc); - apply_lora(adapter, layer.ffn_up_enc); - - apply_lora(adapter, layer.ffn_gate_inp); - apply_lora(adapter, layer.ffn_gate_exps); - apply_lora(adapter, layer.ffn_down_exps); - apply_lora(adapter, layer.ffn_up_exps); - - apply_lora(adapter, layer.ffn_gate_inp_shexp); - apply_lora(adapter, layer.ffn_gate_shexp); -
apply_lora(adapter, layer.ffn_down_shexp); - apply_lora(adapter, layer.ffn_up_shexp); - - apply_lora(adapter, layer.ffn_gate_b); - apply_lora(adapter, layer.ffn_down_b); - apply_lora(adapter, layer.ffn_up_b); - apply_lora(adapter, layer.ffn_act); - } - - // merge lora to model weight - ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); - if (res == GGML_STATUS_SUCCESS) { - // graph maybe realloc, we need to find correct gf->nodes based on name - size_t n_merged = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - auto node = gf->nodes[i]; - std::string name(node->name); - if (output_map.find(name) != output_map.end()) { - struct ggml_tensor * model_tensor = output_map[name]; - ggml_backend_tensor_copy(node, model_tensor); - n_merged++; - } - } - GGML_ASSERT(n_merged == output_map.size()); - LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, n_merged); - } else { - LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); - return res; - } - - ggml_free(ctx0); +int32_t llama_lora_adapter_set( + struct llama_context * ctx, + struct llama_lora_adapter * adapter, + float scale) { + ctx->lora_adapters[adapter] = scale; return 0; } +int32_t llama_lora_adapter_remove( + struct llama_context * ctx, + struct llama_lora_adapter * adapter) { + auto pos = ctx->lora_adapters.find(adapter); + if (pos != ctx->lora_adapters.end()) { + ctx->lora_adapters.erase(pos); + return 0; + } + return -1; +} + +void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { + delete adapter; +} + // // interface implementation // @@ -19343,17 +19282,11 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora, float scale) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { - struct llama_lora_adapter * adapter = new llama_lora_adapter; - adapter->scale = scale; - int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); - if (res == 0) { - ctx->lora_adapters.push_back(adapter); - return adapter; - } else { - return nullptr; - } + struct llama_lora_adapter * adapter = new llama_lora_adapter(model); + int res = llama_lora_adapter_init_internal(model, path_lora, *adapter); + return res == 0 ? adapter : nullptr; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return nullptr;
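Numerically, the graph that llm_build_mm assembles computes res = W*x + scale * (B * (A * x)) per attached adapter, evaluated right to left so the rank-r path costs O(r*(n_in + n_out)) per column rather than the O(n_in*n_out) needed to materialize W + scale*B*A, which is what lets the old merge step above be deleted. A self-contained sketch of that arithmetic, ignoring ggml's mul_mat layout conventions and the transpose TODO in the patch:

```cpp
// Plain-array sketch (not ggml) of the LoRA-augmented matmul:
//   res = W*x + scale * (B * (A * x))
// with W: [n_out x n_in], A: [r x n_in], B: [n_out x r], r << min(n_in, n_out).
#include <cstdio>
#include <vector>

using Mat = std::vector<std::vector<float>>; // row-major [rows][cols]

static std::vector<float> matvec(const Mat & M, const std::vector<float> & x) {
    std::vector<float> y(M.size(), 0.0f);
    for (size_t i = 0; i < M.size(); ++i)
        for (size_t j = 0; j < x.size(); ++j)
            y[i] += M[i][j] * x[j];
    return y;
}

int main() {
    const Mat W = {{1, 0, 0}, {0, 1, 0}};   // base weight, n_out=2, n_in=3
    const Mat A = {{1, 1, 1}};              // lora_a, r=1
    const Mat B = {{0.5f}, {-0.5f}};        // lora_b, n_out=2
    const float scale = 0.8f;
    const std::vector<float> x = {1, 2, 3};

    std::vector<float> res = matvec(W, x);                 // base matmul
    const std::vector<float> ab = matvec(B, matvec(A, x)); // low-rank path
    for (size_t i = 0; i < res.size(); ++i) res[i] += scale * ab[i];

    printf("%g %g\n", res[0], res[1]); // prints: 3.4 -0.4
    return 0;
}
```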