convert : use 1e6 rope_freq_base for mixtral

slaren 2023-12-11 20:00:28 +01:00
parent 296c945de5
commit 7dc75e3923


@@ -259,6 +259,7 @@ class Params:
         n_experts = None
         n_experts_used = None
+        f_rope_freq_base = None
 
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
         if config.get("moe"):
@@ -281,6 +282,8 @@ class Params:
             n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
             n_experts = config["moe"]["num_experts"]
             n_experts_used = config["moe"]["num_experts_per_tok"]
+            f_rope_freq_base = 1e6
 
         return Params(
             n_vocab = model["tok_embeddings.weight"].shape[0],
@@ -293,7 +296,7 @@ class Params:
             n_experts = n_experts,
             n_experts_used = n_experts_used,
             f_norm_eps = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta"),
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
         )
 
     @staticmethod
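
For context, a minimal sketch of the fallback this diff introduces. The config dict below is a made-up stand-in for a Mixtral-style params.json that omits rope_theta (which is why the 1e6 default is needed); the expert counts are only illustrative. dict.get returns its second argument when the key is missing, so models that do specify rope_theta keep their own value, while the MoE branch now ends up with 1e6 instead of None.

# Hypothetical stand-in config, not the actual params.json contents.
config = {"moe": {"num_experts": 8, "num_experts_per_tok": 2}, "norm_eps": 1e-5}

f_rope_freq_base = None

if config.get("moe"):
    # Mixtral-style configs omit rope_theta, so pre-set the 1e6 default here.
    f_rope_freq_base = 1e6

# dict.get falls back to the second argument only when the key is absent,
# so an explicit rope_theta in the config would still take precedence.
rope_freq_base = config.get("rope_theta", f_rope_freq_base)
print(rope_freq_base)  # 1000000.0 for the MoE config above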