gguf : add rope_freq_base parameter for CodeLlama (#2769)

Author: slaren
Date:   2023-08-24 20:04:05 +02:00 (committed by GitHub)
Parent: 01f2224682
Commit: 0d3094f0c7
3 changed files with 48 additions and 21 deletions

convert.py

@@ -104,6 +104,8 @@ class Params:
    n_head_kv:  int
    f_norm_eps: float
+   f_rope_freq_base: Optional[float] = None
    ftype: Optional[GGMLFileType] = None
    # path to the directory containing the model files
@ -203,6 +205,7 @@ class Params:
n_head = config["n_heads"] n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
f_norm_eps = config["norm_eps"] f_norm_eps = config["norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
if n_vocab == -1: if n_vocab == -1:
n_vocab = model["tok_embeddings.weight"].shape[0] n_vocab = model["tok_embeddings.weight"].shape[0]
@@ -220,6 +223,7 @@ class Params:
            n_head           = n_head,
            n_head_kv        = n_head_kv,
            f_norm_eps       = f_norm_eps,
+           f_rope_freq_base = f_rope_freq_base,
        )

    @staticmethod
@@ -754,6 +758,9 @@ class OutputFile:
        self.gguf.add_head_count_kv     (params.n_head_kv)
        self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+
+       if params.f_rope_freq_base:
+           self.gguf.add_rope_freq_base(params.f_rope_freq_base)

        if params.ftype:
            self.gguf.add_file_type(params.ftype)
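
The convert.py changes above pick up `rope_theta` from the original LLaMA-style `params.json` (CodeLlama ships `rope_theta = 1000000`, while older LLaMA configs omit the key) and write it to the GGUF metadata only when present. A minimal sketch of that read path, with an illustrative file name, not the converter itself:

```python
# Sketch only: mirrors the Optional[float] handling in Params above.
# "params.json" is the config file shipped next to the original weights.
import json
from typing import Optional

def load_rope_freq_base(params_path: str = "params.json") -> Optional[float]:
    with open(params_path) as f:
        config = json.load(f)
    # CodeLlama sets rope_theta (1e6); base LLaMA configs omit it entirely.
    return config["rope_theta"] if "rope_theta" in config else None
```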

gguf-py/gguf/gguf.py

@@ -47,6 +47,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"

# tokenization
@@ -663,6 +664,9 @@ class GGUFWriter:
        self.add_uint32(
            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

+   def add_rope_freq_base(self, value: float):
+       self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
    def add_rope_scale_linear(self, value: float):
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
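
The new writer method works like the other RoPE keys: it stores a single float32 under `{arch}.rope.freq_base`. A hedged usage sketch; the output path and the `GGUFWriter(path, arch)` constructor call are assumptions about the gguf Python package, not taken from this diff:

```python
import gguf

# Assumed constructor signature; 1e6 is the rope_theta value CodeLlama uses.
writer = gguf.GGUFWriter("codellama-7b.gguf", arch="llama")
writer.add_rope_freq_base(1000000.0)   # recorded as "llama.rope.freq_base"
```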

llama.cpp

@@ -195,6 +195,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

    LLM_KV_ROPE_DIMENSION_COUNT,
+   LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,

    LLM_KV_TOKENIZER_MODEL,
@@ -238,6 +239,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },

    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count" },
+   { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"       },
    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"    },

    { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"    },
@@ -1561,12 +1563,26 @@ static void llm_load_hparams(
        hparams.n_head_kv = hparams.n_head;
        GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

-       // TODO: manually setting rope scale should override this
+       // TODO: manually setting rope freq base and scale should override this
+       // FIXME: partial fix when the param specified is not the default value, but
+       //        will not work for overriding the model value to the params default
+
+       llama_context_params defaults = llama_context_default_params();
+
+       // rope_freq_base
+       {
+           float ropebase = 10000.0f;
+           GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+           if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
+               rope_freq_base = ropebase;
+           }
+       }
+
        // rope_freq_scale (inverse of the kv) is optional
        {
            float ropescale = 1.0f;
            GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-           if (ropescale != 1.0f) {
+           if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
                rope_freq_scale = 1.0f/ropescale;
            }
        }
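
The loader hunk gives the stored key a conditional precedence: the file's `rope.freq_base` is applied only when it differs from 10000.0 and the caller left `rope_freq_base` at the library default, which is exactly the limitation the FIXME records (explicitly requesting the default value cannot override a model value). A small Python model of that decision rule, not the llama.cpp code itself; the function name is illustrative:

```python
DEFAULT_ROPE_FREQ_BASE = 10000.0  # llama_context_default_params().rope_freq_base

def effective_rope_freq_base(requested: float, from_gguf: float | None) -> float:
    """Model of the precedence above: the GGUF value wins only if it is
    non-default and the caller did not change rope_freq_base themselves."""
    if from_gguf is not None and from_gguf != DEFAULT_ROPE_FREQ_BASE \
            and requested == DEFAULT_ROPE_FREQ_BASE:
        return from_gguf      # e.g. CodeLlama's 1000000.0 read from the file
    return requested          # user override, or the default when no key is set

# A CodeLlama GGUF with no user override yields 1e6; an explicit non-default wins.
assert effective_rope_freq_base(10000.0, 1000000.0) == 1000000.0
assert effective_rope_freq_base(50000.0, 1000000.0) == 50000.0
```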