mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 12:21:40 +01:00
Remove custom pre attention scaling and use computed value instead.
This commit is contained in:
parent
a89427908d
commit
51f0bd50a1
@ -2369,9 +2369,6 @@ class Gemma2Model(Model):
|
|||||||
self.gguf_writer.add_final_logit_softcapping(
|
self.gguf_writer.add_final_logit_softcapping(
|
||||||
self.hparams["final_logit_softcapping"]
|
self.hparams["final_logit_softcapping"]
|
||||||
)
|
)
|
||||||
self.gguf_writer.add_query_pre_attn_scalar(
|
|
||||||
self.hparams["query_pre_attn_scalar"]
|
|
||||||
)
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
del bid # unused
|
del bid # unused
|
||||||
|
@ -52,7 +52,6 @@ class Keys:
|
|||||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||||
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
||||||
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
||||||
QUERY_PRE_ATTN_SCALAR = "{arch}.query_pre_attn_scalar"
|
|
||||||
|
|
||||||
class Attention:
|
class Attention:
|
||||||
HEAD_COUNT = "{arch}.attention.head_count"
|
HEAD_COUNT = "{arch}.attention.head_count"
|
||||||
|
@ -522,9 +522,6 @@ class GGUFWriter:
|
|||||||
def add_final_logit_softcapping(self, value: float) -> None:
|
def add_final_logit_softcapping(self, value: float) -> None:
|
||||||
self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
|
self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_query_pre_attn_scalar(self, value: float) -> None:
|
|
||||||
self.add_float32(Keys.LLM.QUERY_PRE_ATTN_SCALAR.format(arch=self.arch), value)
|
|
||||||
|
|
||||||
def add_expert_count(self, count: int) -> None:
|
def add_expert_count(self, count: int) -> None:
|
||||||
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
|
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
|
||||||
|
|
||||||
|
@ -304,7 +304,6 @@ enum llm_kv {
|
|||||||
LLM_KV_DECODER_START_TOKEN_ID,
|
LLM_KV_DECODER_START_TOKEN_ID,
|
||||||
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
|
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
|
||||||
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
|
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
|
||||||
LLM_KV_QUERY_PRE_ATTN_SCALAR,
|
|
||||||
|
|
||||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||||
@ -397,7 +396,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||||
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
|
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
|
||||||
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
|
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
|
||||||
{ LLM_KV_QUERY_PRE_ATTN_SCALAR, "%s.query_pre_attn_scalar" },
|
|
||||||
|
|
||||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||||
@ -2107,7 +2105,6 @@ struct llama_hparams {
|
|||||||
|
|
||||||
float f_attn_logit_softcapping = 50.0f;
|
float f_attn_logit_softcapping = 50.0f;
|
||||||
float f_final_logit_softcapping = 30.0f;
|
float f_final_logit_softcapping = 30.0f;
|
||||||
float f_query_pre_attn_scalar = 144.0f;
|
|
||||||
|
|
||||||
float rope_attn_factor = 1.0f;
|
float rope_attn_factor = 1.0f;
|
||||||
float rope_freq_base_train;
|
float rope_freq_base_train;
|
||||||
@ -4715,7 +4712,6 @@ static void llm_load_hparams(
|
|||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
||||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||||
ml.get_key(LLM_KV_QUERY_PRE_ATTN_SCALAR, hparams.f_query_pre_attn_scalar, false);
|
|
||||||
hparams.attn_soft_cap = true;
|
hparams.attn_soft_cap = true;
|
||||||
|
|
||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
@ -10952,7 +10948,7 @@ struct llm_build_context {
|
|||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(hparams.f_query_pre_attn_scalar));
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
||||||
cb(Qcur, "Qcur_scaled", il);
|
cb(Qcur, "Qcur_scaled", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_ext(
|
Kcur = ggml_rope_ext(
|
||||||
@ -11060,7 +11056,7 @@ struct llm_build_context {
|
|||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head)));
|
||||||
cb(Qcur, "Qcur_scaled", il);
|
cb(Qcur, "Qcur_scaled", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_ext(
|
Kcur = ggml_rope_ext(
|
||||||
|
Loading…
Reference in New Issue
Block a user