convert : support safetensors format

2025-01-01 00:39:00 +01:00 · 2023-12-12 13:04:33 +02:00 · 2023-12-12 13:04:33 +02:00 · 6a419f4d19
commit 6a419f4d19
parent f1cbfabd64
2 changed files with 20 additions and 6 deletions
--- a/convert.py
+++ b/convert.py
@@ -42,6 +42,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8
+
 #
 # data types
 #
@@ -235,6 +236,13 @@ class Params:
            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

+        n_experts      = None
+        n_experts_used = None
+
+        if "num_local_experts" in config:
+            n_experts = config["num_local_experts"]
+            n_experts_used = config["num_experts_per_tok"]
+
        return Params(
            n_vocab           = config["vocab_size"],
            n_embd            = config["hidden_size"],
@@ -243,6 +251,8 @@ class Params:
            n_ff              = config["intermediate_size"],
            n_head            = (n_head := config["num_attention_heads"]),
            n_head_kv         = config.get("num_key_value_heads", n_head),
+            n_experts         = n_experts,
+            n_experts_used    = n_experts_used,
            f_norm_eps        = config["rms_norm_eps"],
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -151,6 +151,7 @@ class TensorNameMap:

        MODEL_TENSOR.FFN_GATE_INP: (
            "layers.{bid}.feed_forward.gate",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
        ),

        # Feed-forward up
@@ -170,6 +171,7 @@ class TensorNameMap:

        MODEL_TENSOR.FFN_UP_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
        ),

        # Feed-forward gate
@@ -181,6 +183,7 @@ class TensorNameMap:

        MODEL_TENSOR.FFN_GATE_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
        ),

        # Feed-forward down
@@ -199,6 +202,7 @@ class TensorNameMap:

        MODEL_TENSOR.FFN_DOWN_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (