mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-28 04:47:04 +01:00
Initial OpenELM support (270M only so far)
This commit is contained in:
parent
c1b295eea5
commit
217d8d7b77
@ -2395,6 +2395,50 @@ class JinaBertV2Model(BertModel):
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
|
||||
@Model.register("OpenELMForCausalLM")
|
||||
class OpenELMModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.OPENELM
|
||||
|
||||
# Copied from LlamaModel
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self. _set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_llama_hf()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# TODO: Look closer at these
|
||||
|
||||
self.gguf_writer.add_name("OpenELM")
|
||||
self.block_count = self.find_hparam(["num_transformer_layers"])
|
||||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
# https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
|
||||
self.gguf_writer.add_layer_norm_rms_eps(1e-6)
|
||||
n_embd = self.find_hparam(["model_dim"])
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
head_dim = self.find_hparam(["head_dim"])
|
||||
n_head = n_embd // head_dim
|
||||
rot_pct = 1.0
|
||||
self.gguf_writer.add_context_length(self.find_hparam(["max_context_length"]))
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_head_count_kv(n_head*10)
|
||||
self.gguf_writer.add_head_count(n_head*10)
|
||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_feed_forward_length(0) # dynamically calculated
|
||||
|
||||
def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
# TODO: Read configuration!
|
||||
if "n_layers" in keys:
|
||||
return 16 # num_transformer_layers
|
||||
if "hidden_size" in keys:
|
||||
return 1280 # model_dim
|
||||
if "num_attention_heads" in keys:
|
||||
return 64 # head_dim
|
||||
|
||||
return super().find_hparam(keys, optional)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
OLMO = auto()
|
||||
OPENELM = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@ -217,6 +218,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
MODEL_ARCH.OLMO: "olmo",
|
||||
MODEL_ARCH.OPENELM: "openelm",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@ -743,6 +745,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.OPENELM: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
@ -24,6 +24,7 @@ class TensorNameMap:
|
||||
"backbone.embedding", # mamba
|
||||
"backbone.embeddings", # mamba-hf
|
||||
"transformer.in_out_embed", # Grok
|
||||
"transformer.token_embeddings", # openelm
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@ -36,6 +37,7 @@ class TensorNameMap:
|
||||
"word_embeddings_layernorm", # bloom
|
||||
"embeddings.LayerNorm", # bert
|
||||
"emb_ln", # nomic-bert
|
||||
"transformer.norm", # openelm
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
@ -68,6 +70,7 @@ class TensorNameMap:
|
||||
"model.norm_f", # mamba-qbert
|
||||
"backbone.norm_f", # mamba
|
||||
"transformer.rms_norm", # Grok
|
||||
"transformer.norm", # openelm
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
@ -97,6 +100,7 @@ class TensorNameMap:
|
||||
"backbone.layers.{bid}.norm", # mamba
|
||||
"transformer.decoder_layer.{bid}.rms_norm", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
|
||||
"transformer.layers.{bid}.attn_norm", # openelm
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@ -117,7 +121,8 @@ class TensorNameMap:
|
||||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||
"model.layers.{bid}.self_attn.qkv_proj" # phi3
|
||||
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||
),
|
||||
|
||||
# Attention query
|
||||
@ -175,6 +180,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
|
||||
"transformer.layers.{bid}.attn.out_proj", # openelm
|
||||
),
|
||||
|
||||
# Attention output norm
|
||||
@ -206,6 +212,7 @@ class TensorNameMap:
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
"model.layers.{bid}.ffn_norm", # internlm2
|
||||
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
||||
"transformer.layers.{bid}.ffn_norm", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
@ -244,6 +251,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||
"transformer.layers.{bid}.ffn.proj_1", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@ -306,6 +314,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||
"transformer.layers.{bid}.ffn.proj_2", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
@ -324,7 +333,8 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q" # jina-bert-v2
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
@ -332,7 +342,8 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k" # jina-bert-v2
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
|
250
llama.cpp
250
llama.cpp
@ -229,6 +229,7 @@ enum llm_arch {
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_DBRX,
|
||||
LLM_ARCH_OLMO,
|
||||
LLM_ARCH_OPENELM,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@ -266,6 +267,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_DBRX, "dbrx" },
|
||||
{ LLM_ARCH_OLMO, "olmo" },
|
||||
{ LLM_ARCH_OPENELM, "openelm" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@ -1052,6 +1054,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_OPENELM,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@ -1715,7 +1732,9 @@ enum e_model {
|
||||
MODEL_33M,
|
||||
MODEL_109M,
|
||||
MODEL_137M,
|
||||
MODEL_270M,
|
||||
MODEL_335M,
|
||||
MODEL_450M,
|
||||
MODEL_0_5B,
|
||||
MODEL_1B,
|
||||
MODEL_2B,
|
||||
@ -4261,6 +4280,18 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OPENELM:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 16: model.type = e_model::MODEL_270M; break;
|
||||
case 20: model.type = e_model::MODEL_450M; break;
|
||||
case 28: model.type = e_model::MODEL_1B; break;
|
||||
case 36: model.type = e_model::MODEL_3B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@ -4767,6 +4798,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
||||
}
|
||||
|
||||
static int make_divisible(
|
||||
double v,
|
||||
int divisor = 8,
|
||||
double min_value = 0
|
||||
) {
|
||||
if (min_value == 0) {
|
||||
min_value = divisor;
|
||||
}
|
||||
int new_v = int(v + divisor / 2.0) / divisor * divisor;
|
||||
if (new_v < min_value) {
|
||||
new_v = min_value;
|
||||
}
|
||||
if (new_v < 0.9 * v) {
|
||||
new_v += divisor;
|
||||
}
|
||||
return new_v;
|
||||
}
|
||||
|
||||
// Returns false if cancelled by progress_callback
|
||||
static bool llm_load_tensors(
|
||||
llama_model_loader & ml,
|
||||
@ -6060,6 +6109,52 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OPENELM:
|
||||
{
|
||||
{
|
||||
{
|
||||
std::vector<int> num_kv_heads = {3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5};
|
||||
std::vector<int> num_query_heads = {12, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20};
|
||||
std::vector<float> ffn_multipliers = {0.5, 0.73, 0.97, 1.2, 1.43, 1.67, 1.9, 2.13, 2.37, 2.6, 2.83, 3.07, 3.3, 3.53, 3.77, 4.0};
|
||||
|
||||
llama_hparams modified_hparams(hparams);
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
|
||||
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
const int64_t n_head_k = num_kv_heads[i];
|
||||
const int64_t n_head_v = num_kv_heads[i];
|
||||
const int64_t n_head_kv = n_head_k + n_head_v;
|
||||
const int64_t n_head = n_head_kv + num_query_heads[i];
|
||||
modified_hparams.n_head = n_head;
|
||||
modified_hparams.n_embd_head_v = 64;
|
||||
modified_hparams.n_embd_head_k = 64;
|
||||
int64_t n_embd_head = modified_hparams.n_embd_head_v;
|
||||
|
||||
modified_hparams.n_head_kv = n_head_kv;
|
||||
const int64_t ffn_inter = make_divisible(n_embd*ffn_multipliers[i], 256);
|
||||
|
||||
ggml_context* ctx_layer = ctx_for_layer(i);
|
||||
ggml_context* ctx_split = ctx_for_layer_split(i);
|
||||
auto& layer = model.layers[i];
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd_head*n_head });
|
||||
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head });
|
||||
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head });
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head_kv*n_embd_head*2, n_embd });
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * ffn_inter });
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_inter, n_embd });
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@ -10780,6 +10875,156 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_openelm() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = 64;
|
||||
// TODO: get this from config
|
||||
std::vector<int> num_kv_heads = {3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5};
|
||||
std::vector<int> num_query_heads = {12, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20};
|
||||
|
||||
llama_hparams modified_hparams(hparams);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_head_q = num_query_heads[il];
|
||||
const int64_t n_head_k = num_kv_heads[il];
|
||||
const int64_t n_head_v = num_kv_heads[il];
|
||||
const int64_t n_head_kv = n_head_k + n_head_v;
|
||||
const int64_t n_head = n_head_kv + n_head_q;
|
||||
|
||||
modified_hparams.n_head = n_head_q;
|
||||
modified_hparams.n_head_kv = n_head_k;
|
||||
modified_hparams.n_embd_head_k = 64;
|
||||
modified_hparams.n_embd_head_v = 64;
|
||||
|
||||
cur = inpL;
|
||||
struct ggml_tensor * residual = cur;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, modified_hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd_head, n_head, n_tokens);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_q, n_tokens, cur->nb[1], cur->nb[2], 0));
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_k, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_q));
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_v, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_q+n_head_k)));
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = llm_build_norm(ctx0, Qcur, modified_hparams,
|
||||
model.layers[il].attn_q_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = llm_build_norm(ctx0, Kcur, modified_hparams,
|
||||
model.layers[il].attn_k_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_v, n_tokens);
|
||||
cb(Qcur, "Vcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, modified_hparams, cparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = llm_build_norm(ctx0, ffn_inp, modified_hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// TODO: Split ffn_up during conversion?
|
||||
struct ggml_tensor * ffn_gate =
|
||||
ggml_view_2d(ctx0,
|
||||
model.layers[il].ffn_up,
|
||||
model.layers[il].ffn_down->ne[1],
|
||||
model.layers[il].ffn_down->ne[0],
|
||||
model.layers[il].ffn_up->nb[1],
|
||||
0);
|
||||
|
||||
struct ggml_tensor * ffn_up =
|
||||
ggml_view_2d(ctx0,
|
||||
model.layers[il].ffn_up,
|
||||
model.layers[il].ffn_down->ne[1],
|
||||
model.layers[il].ffn_down->ne[0],
|
||||
model.layers[il].ffn_up->nb[1],
|
||||
model.layers[il].ffn_up->nb[1] * model.layers[il].ffn_down->ne[0]);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
ffn_up, NULL,
|
||||
ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
@ -10994,6 +11239,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_olmo();
|
||||
} break;
|
||||
case LLM_ARCH_OPENELM:
|
||||
{
|
||||
result = llm.build_openelm();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@ -16036,6 +16285,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_PHI3:
|
||||
case LLM_ARCH_GEMMA:
|
||||
case LLM_ARCH_STARCODER2:
|
||||
case LLM_ARCH_OPENELM:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
// all model arches should be listed explicitly here
|
||||
|
Loading…
Reference in New Issue
Block a user