convert : support safetensors format

This commit is contained in:
Georgi Gerganov 2023-12-12 13:04:33 +02:00
parent f1cbfabd64
commit 6a419f4d19
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
2 changed files with 20 additions and 6 deletions

View File

@ -42,6 +42,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
ARCH = gguf.MODEL_ARCH.LLAMA ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8 DEFAULT_CONCURRENCY = 8
# #
# data types # data types
# #
@ -235,6 +236,13 @@ class Params:
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.") "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
n_experts = None
n_experts_used = None
if "num_local_experts" in config:
n_experts = config["num_local_experts"]
n_experts_used = config["num_experts_per_tok"]
return Params( return Params(
n_vocab = config["vocab_size"], n_vocab = config["vocab_size"],
n_embd = config["hidden_size"], n_embd = config["hidden_size"],
@ -243,6 +251,8 @@ class Params:
n_ff = config["intermediate_size"], n_ff = config["intermediate_size"],
n_head = (n_head := config["num_attention_heads"]), n_head = (n_head := config["num_attention_heads"]),
n_head_kv = config.get("num_key_value_heads", n_head), n_head_kv = config.get("num_key_value_heads", n_head),
n_experts = n_experts,
n_experts_used = n_experts_used,
f_norm_eps = config["rms_norm_eps"], f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = config.get("rope_theta"), f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type, rope_scaling_type = rope_scaling_type,
@ -257,7 +267,7 @@ class Params:
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_experts = None n_experts = None
n_experts_used = None n_experts_used = None
f_rope_freq_base = None f_rope_freq_base = None
@ -280,7 +290,7 @@ class Params:
if config.get("moe"): if config.get("moe"):
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
n_experts = config["moe"]["num_experts"] n_experts = config["moe"]["num_experts"]
n_experts_used = config["moe"]["num_experts_per_tok"] n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6 f_rope_freq_base = 1e6

View File

@ -150,7 +150,8 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
), ),
# Feed-forward up # Feed-forward up
@ -169,7 +170,8 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
), ),
# Feed-forward gate # Feed-forward gate
@ -180,7 +182,8 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
), ),
# Feed-forward down # Feed-forward down
@ -198,7 +201,8 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (