mpt : add optional bias tensors (#5638)

Update for MPT with optional bias parameters: to work with PhoGPT and SEA-LION models that were pre-trained with 'bias'.
This commit is contained in:
Dat Quoc Nguyen 2024-02-22 18:15:13 +10:00 committed by GitHub
parent 973053d8b0
commit 4ef245a92a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -4054,6 +4054,8 @@ static bool llm_load_tensors(
// output // output
{ {
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
} }
@ -4063,14 +4065,23 @@ static bool llm_load_tensors(
auto & layer = model.layers[i]; auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
// AWQ ScaleActivation layer // AWQ ScaleActivation layer
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@ -6171,7 +6182,7 @@ struct llm_build_context {
attn_norm = llm_build_norm(ctx0, inpL, hparams, attn_norm = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, model.layers[il].attn_norm,
NULL, model.layers[il].attn_norm_b,
LLM_NORM, cb, il); LLM_NORM, cb, il);
cb(attn_norm, "attn_norm", il); cb(attn_norm, "attn_norm", il);
@ -6182,6 +6193,11 @@ struct llm_build_context {
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il); cb(cur, "wqkv", il);
if (model.layers[il].bqkv){
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
cb(cur, "bqkv", il);
}
if (hparams.f_clamp_kqv > 0.0f) { if (hparams.f_clamp_kqv > 0.0f) {
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
cb(cur, "wqkv_clamped", il); cb(cur, "wqkv_clamped", il);
@ -6198,7 +6214,7 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -6211,13 +6227,13 @@ struct llm_build_context {
{ {
cur = llm_build_norm(ctx0, ffn_inp, hparams, cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, model.layers[il].ffn_norm,
NULL, model.layers[il].ffn_norm_b,
LLM_NORM, cb, il); LLM_NORM, cb, il);
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, model.layers[il].ffn_up_b,
NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b,
model.layers[il].ffn_act, model.layers[il].ffn_act,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -6234,7 +6250,7 @@ struct llm_build_context {
cur = llm_build_norm(ctx0, cur, hparams, cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, model.output_norm,
NULL, model.output_norm_b,
LLM_NORM, cb, -1); LLM_NORM, cb, -1);
cb(cur, "result_norm", -1); cb(cur, "result_norm", -1);