llama : model loading

This commit is contained in:
Georgi Gerganov 2023-12-09 11:14:03 +02:00
parent d38e41ee69
commit a3eefe95a8
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@ -338,10 +338,14 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_DOWN_EXP,
LLM_TENSOR_FFN_GATE_EXP,
LLM_TENSOR_FFN_UP_EXP,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
@ -360,10 +364,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
},
},
{
@ -585,6 +593,10 @@ struct LLM_TN {
// Builds a per-block tensor name: the name template registered for (arch, tensor)
// in LLM_TENSOR_NAMES is expanded with the block index `bid`, then "." and
// `suffix` are appended. The lookup uses .at(), so a tensor with no entry for
// this architecture raises instead of yielding an empty name.
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
    const std::string & name_tmpl = LLM_TENSOR_NAMES[arch].at(tensor);
    const auto block_name = ::format(name_tmpl.c_str(), bid);
    return block_name + "." + suffix;
}
// Builds a tensor name that takes two indices: the name template registered for
// (arch, tensor) in LLM_TENSOR_NAMES is expanded with `bid` first and `xid`
// second, then "." and `suffix` are appended. Intended for templates carrying
// two integer placeholders (e.g. "blk.%d.ffn_gate.%d").
// The lookup uses .at(), so a tensor with no entry for this architecture raises.
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
    const std::string & name_tmpl = LLM_TENSOR_NAMES[arch].at(tensor);
    const auto indexed_name = ::format(name_tmpl.c_str(), bid, xid);
    return indexed_name + "." + suffix;
}
};
//
@ -1268,6 +1280,12 @@ struct llama_layer {
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
// ff MoE
struct ggml_tensor * ffn_gate_inp;
struct ggml_tensor * ffn_gate_exp[8];
struct ggml_tensor * ffn_down_exp[8];
struct ggml_tensor * ffn_up_exp[8];
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
@ -3025,9 +3043,20 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
if (layer.ffn_gate_inp == nullptr) {
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
} else {
// MoE branch
for (int x = 0; x < 8; ++x) {
layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
}
}
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
@ -3037,8 +3066,18 @@ static void llm_load_tensors(
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
ggml_nbytes(layer.ffn_norm);
if (layer.ffn_gate_inp == nullptr) {
vram_weights +=
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
} else {
vram_weights += ggml_nbytes(layer.ffn_gate_inp);
for (int x = 0; x < 8; ++x) {
vram_weights +=
ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
}
}
}
}
} break;