gguf : add BERT, MPT, and GPT-J arch info (#3408)
This commit is contained in:
parent 0fe321031a
commit 29a404a951
@@ -85,10 +85,12 @@ class MODEL_ARCH(IntEnum):
     GPTNEOX : int = auto()
     MPT : int = auto()
     STARCODER : int = auto()
+    BERT : int = auto()
 
 
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD : int = auto()
+    TOKEN_TYPES : int = auto()
     POS_EMBD : int = auto()
     OUTPUT : int = auto()
     OUTPUT_NORM : int = auto()
@@ -116,10 +118,12 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GPTNEOX: "gptneox",
     MODEL_ARCH.MPT: "mpt",
     MODEL_ARCH.STARCODER: "starcoder",
+    MODEL_ARCH.BERT: "bert",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.TOKEN_EMBD: "token_embd",
+    MODEL_TENSOR.TOKEN_TYPES: "token_types",
     MODEL_TENSOR.POS_EMBD: "position_embd",
     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
     MODEL_TENSOR.OUTPUT: "output",
@@ -206,6 +210,43 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.MPT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPTJ: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GPT2: [
         # TODO
     ],
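As an aside, per-architecture lists like the new MODEL_ARCH.BERT entry above are what conversion code can iterate over to enumerate the tensors a GGUF file for that architecture is expected to contain. The snippet below is a minimal, self-contained sketch of that expansion; the enum, the base-name table, and the helper name are illustrative stand-ins rather than the actual gguf-py definitions (in the real TENSOR_NAMES table the per-block entries are likewise templated on a {bid} block index).

from enum import IntEnum, auto

# Illustrative stand-ins for MODEL_TENSOR / TENSOR_NAMES; values are assumptions
# for this sketch, not the strings defined in gguf-py.
class ModelTensor(IntEnum):
    TOKEN_EMBD  = auto()
    TOKEN_TYPES = auto()
    ATTN_Q      = auto()
    FFN_UP      = auto()

TENSOR_BASE_NAMES = {
    ModelTensor.TOKEN_EMBD:  "token_embd",
    ModelTensor.TOKEN_TYPES: "token_types",
    ModelTensor.ATTN_Q:      "blk.{bid}.attn_q",  # per-block: templated on the block index
    ModelTensor.FFN_UP:      "blk.{bid}.ffn_up",
}

BERT_TENSORS = [ModelTensor.TOKEN_EMBD, ModelTensor.TOKEN_TYPES,
                ModelTensor.ATTN_Q, ModelTensor.FFN_UP]

def expected_tensor_names(tensors, n_blocks):
    """Expand an architecture's tensor list into concrete per-block names."""
    names = []
    for t in tensors:
        base = TENSOR_BASE_NAMES[t]
        if "{bid}" in base:
            names.extend(base.format(bid=i) for i in range(n_blocks))
        else:
            names.append(base)
    return names

print(expected_tensor_names(BERT_TENSORS, n_blocks=2))
# ['token_embd', 'token_types', 'blk.0.attn_q', 'blk.1.attn_q', 'blk.0.ffn_up', 'blk.1.ffn_up']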
@@ -229,31 +270,40 @@ class TensorNameMap:
     mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in", # gptneox
-            "transformer.wte", # gpt2 mpt
+            "transformer.wte", # gpt2 gpt-j mpt
             "transformer.word_embeddings", # falcon
             "model.embed_tokens", # llama-hf
             "tok_embeddings", # llama-pth
+            "embeddings.word_embeddings", # bert
+        ),
+
+        # Token type embeddings
+        MODEL_TENSOR.TOKEN_TYPES: (
+            "embeddings.token_type_embeddings", # bert
         ),
 
         # Position embeddings
         MODEL_TENSOR.POS_EMBD: (
             "transformer.wpe", # gpt2
+            "embeddings.position_embeddings", # bert
         ),
 
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out", # gptneox
-            "lm_head", # gpt2 mpt falcon llama-hf baichuan
+            "lm_head", # gpt2 gpt-j mpt falcon llama-hf baichuan
             "output", # llama-pth
         ),
 
         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm", # gptneox
-            "transformer.ln_f", # gpt2 falcon
+            "transformer.ln_f", # gpt2 gpt-j falcon
             "model.norm", # llama-hf baichuan
             "norm", # llama-pth
+            "embeddings.LayerNorm", # bert
+            "transformer.norm_f", # mpt
         ),
 
         # Rope frequencies
@@ -265,13 +315,14 @@ class TensorNameMap:
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm", # gptneox
-            "transformer.h.{bid}.ln_1", # gpt2
+            "transformer.h.{bid}.ln_1", # gpt2 gpt-j
             "transformer.blocks.{bid}.norm_1", # mpt
             "transformer.h.{bid}.input_layernorm", # falcon7b
             "transformer.h.{bid}.ln_mlp", # falcon40b
             "model.layers.{bid}.input_layernorm", # llama-hf
             "layers.{bid}.attention_norm", # llama-pth
+            "encoder.layer.{bid}.attention.output.LayerNorm", # bert
         ),
 
         # Attention norm 2
# Attention norm 2
|
# Attention norm 2
|
||||||
@@ -281,38 +332,46 @@ class TensorNameMap:
 
         # Attention query-key-value
         MODEL_TENSOR.ATTN_QKV: (
             "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
             "transformer.h.{bid}.attn.c_attn", # gpt2
             "transformer.blocks.{bid}.attn.Wqkv", # mpt
             "transformer.h.{bid}.self_attention.query_key_value", # falcon
         ),
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
             "model.layers.{bid}.self_attn.q_proj", # llama-hf
             "layers.{bid}.attention.wq", # llama-pth
+            "encoder.layer.{bid}.attention.self.query", # bert
+            "transformer.h.{bid}.attn.q_proj", # gpt-j
         ),
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
             "model.layers.{bid}.self_attn.k_proj", # llama-hf
             "layers.{bid}.attention.wk", # llama-pth
+            "encoder.layer.{bid}.attention.self.key", # bert
+            "transformer.h.{bid}.attn.k_proj", # gpt-j
         ),
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
             "model.layers.{bid}.self_attn.v_proj", # llama-hf
             "layers.{bid}.attention.wv", # llama-pth
+            "encoder.layer.{bid}.attention.self.value", # bert
+            "transformer.h.{bid}.attn.v_proj", # gpt-j
         ),
 
         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
             "gpt_neox.layers.{bid}.attention.dense", # gptneox
             "transformer.h.{bid}.attn.c_proj", # gpt2
             "transformer.blocks.{bid}.attn.out_proj", # mpt
             "transformer.h.{bid}.self_attention.dense", # falcon
             "model.layers.{bid}.self_attn.o_proj", # llama-hf
             "layers.{bid}.attention.wo", # llama-pth
+            "encoder.layer.{bid}.attention.output.dense", # bert
+            "transformer.h.{bid}.attn.out_proj", # gpt-j
         ),
 
         # Rotary embeddings
@@ -323,21 +382,24 @@ class TensorNameMap:
 
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
             "transformer.h.{bid}.ln_2", # gpt2
             "transformer.blocks.{bid}.norm_2", # mpt
             "model.layers.{bid}.post_attention_layernorm", # llama-hf
             "layers.{bid}.ffn_norm", # llama-pth
+            "encoder.layer.{bid}.output.LayerNorm", # bert
         ),
 
         # Feed-forward up
         MODEL_TENSOR.FFN_UP: (
             "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
             "transformer.h.{bid}.mlp.c_fc", # gpt2
             "transformer.blocks.{bid}.ffn.up_proj", # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
             "model.layers.{bid}.mlp.up_proj", # llama-hf
             "layers.{bid}.feed_forward.w3", # llama-pth
+            "encoder.layer.{bid}.intermediate.dense", # bert
+            "transformer.h.{bid}.mlp.fc_in", # gpt-j
         ),
 
         # Feed-forward gate
@@ -348,12 +410,14 @@ class TensorNameMap:
 
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
             "transformer.h.{bid}.mlp.c_proj", # gpt2
             "transformer.blocks.{bid}.ffn.down_proj", # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
             "model.layers.{bid}.mlp.down_proj", # llama-hf
             "layers.{bid}.feed_forward.w2", # llama-pth
+            "encoder.layer.{bid}.output.dense", # bert
+            "transformer.h.{bid}.mlp.fc_out", # gpt-j
         ),
     }
 
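Taken together, the new BERT and GPT-J strings feed the reverse lookup that TensorNameMap builds: each {bid}-templated checkpoint name is expanded for every block and mapped to its GGUF counterpart. Below is a simplified, standalone sketch of that lookup using a few of the templates added in this commit; build_lookup is an illustrative helper rather than the gguf-py API, and the blk.{bid}.* target names are assumed for the example.

from __future__ import annotations

# A few source-name templates quoted from this commit's block_mappings_cfg,
# keyed by assumed GGUF target templates for the sketch.
BLOCK_MAPPINGS = {
    "blk.{bid}.attn_q": (
        "model.layers.{bid}.self_attn.q_proj",       # llama-hf
        "encoder.layer.{bid}.attention.self.query",  # bert
        "transformer.h.{bid}.attn.q_proj",           # gpt-j
    ),
    "blk.{bid}.ffn_down": (
        "transformer.blocks.{bid}.ffn.down_proj",    # mpt
        "encoder.layer.{bid}.output.dense",          # bert
        "transformer.h.{bid}.mlp.fc_out",            # gpt-j
    ),
}

def build_lookup(n_blocks: int) -> dict[str, str]:
    """Expand the {bid} templates for every block into original-name -> GGUF-name."""
    lookup: dict[str, str] = {}
    for gguf_tmpl, src_tmpls in BLOCK_MAPPINGS.items():
        for bid in range(n_blocks):
            for src in src_tmpls:
                lookup[src.format(bid=bid)] = gguf_tmpl.format(bid=bid)
    return lookup

lookup = build_lookup(n_blocks=12)
print(lookup["encoder.layer.0.attention.self.query"])  # -> blk.0.attn_q
print(lookup["transformer.h.3.mlp.fc_out"])            # -> blk.3.ffn_down

Expanding the templates up front, as in this sketch, turns the per-tensor translation during conversion into a plain dictionary access.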