mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-03 17:51:09 +01:00
llama : model loading
This commit is contained in:
parent
d38e41ee69
commit
a3eefe95a8
51
llama.cpp
51
llama.cpp
@ -338,10 +338,14 @@ enum llm_tensor {
|
|||||||
LLM_TENSOR_ATTN_NORM,
|
LLM_TENSOR_ATTN_NORM,
|
||||||
LLM_TENSOR_ATTN_NORM_2,
|
LLM_TENSOR_ATTN_NORM_2,
|
||||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||||
|
LLM_TENSOR_FFN_GATE_INP,
|
||||||
|
LLM_TENSOR_FFN_NORM,
|
||||||
LLM_TENSOR_FFN_GATE,
|
LLM_TENSOR_FFN_GATE,
|
||||||
LLM_TENSOR_FFN_DOWN,
|
LLM_TENSOR_FFN_DOWN,
|
||||||
LLM_TENSOR_FFN_UP,
|
LLM_TENSOR_FFN_UP,
|
||||||
LLM_TENSOR_FFN_NORM,
|
LLM_TENSOR_FFN_DOWN_EXP,
|
||||||
|
LLM_TENSOR_FFN_GATE_EXP,
|
||||||
|
LLM_TENSOR_FFN_UP_EXP,
|
||||||
LLM_TENSOR_ATTN_Q_NORM,
|
LLM_TENSOR_ATTN_Q_NORM,
|
||||||
LLM_TENSOR_ATTN_K_NORM,
|
LLM_TENSOR_ATTN_K_NORM,
|
||||||
};
|
};
|
||||||
@ -360,10 +364,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -585,6 +593,10 @@ struct LLM_TN {
|
|||||||
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
||||||
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
|
||||||
|
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
//
|
//
|
||||||
@ -1268,6 +1280,12 @@ struct llama_layer {
|
|||||||
struct ggml_tensor * ffn_down; // w2
|
struct ggml_tensor * ffn_down; // w2
|
||||||
struct ggml_tensor * ffn_up; // w3
|
struct ggml_tensor * ffn_up; // w3
|
||||||
|
|
||||||
|
// ff MoE
|
||||||
|
struct ggml_tensor * ffn_gate_inp;
|
||||||
|
struct ggml_tensor * ffn_gate_exp[8];
|
||||||
|
struct ggml_tensor * ffn_down_exp[8];
|
||||||
|
struct ggml_tensor * ffn_up_exp[8];
|
||||||
|
|
||||||
// ff bias
|
// ff bias
|
||||||
struct ggml_tensor * ffn_down_b; // b2
|
struct ggml_tensor * ffn_down_b; // b2
|
||||||
struct ggml_tensor * ffn_up_b; // b3
|
struct ggml_tensor * ffn_up_b; // b3
|
||||||
@ -3025,9 +3043,20 @@ static void llm_load_tensors(
|
|||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||||
|
|
||||||
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
|
||||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
|
||||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
if (layer.ffn_gate_inp == nullptr) {
|
||||||
|
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||||
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
} else {
|
||||||
|
// MoE branch
|
||||||
|
for (int x = 0; x < 8; ++x) {
|
||||||
|
layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
|
||||||
|
layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
|
||||||
|
layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (backend == GGML_BACKEND_GPU) {
|
if (backend == GGML_BACKEND_GPU) {
|
||||||
vram_weights +=
|
vram_weights +=
|
||||||
@ -3037,8 +3066,18 @@ static void llm_load_tensors(
|
|||||||
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
|
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
|
||||||
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
|
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
|
||||||
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
|
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
|
||||||
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
|
ggml_nbytes(layer.ffn_norm);
|
||||||
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
|
||||||
|
if (layer.ffn_gate_inp == nullptr) {
|
||||||
|
vram_weights +=
|
||||||
|
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
||||||
|
} else {
|
||||||
|
vram_weights += ggml_nbytes(layer.ffn_gate_inp);
|
||||||
|
for (int x = 0; x < 8; ++x) {
|
||||||
|
vram_weights +=
|
||||||
|
ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
Loading…
Reference in New Issue
Block a user