diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 80a179b86..b931049d1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3040,9 +3040,9 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo1124ForCausalLM") -class Olmo1124Model(Model): - model_arch = gguf.MODEL_ARCH.OLMO_1124 +@Model.register("Olmo2ForCausalLM") +class Olmo2Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO2 @Model.register("OlmoeForCausalLM") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index d83b72f76..7df23371c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() - OLMO_1124 = auto() + OLMO2 = auto() OLMOE = auto() OPENELM = auto() ARCTIC = auto() @@ -405,7 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", - MODEL_ARCH.OLMO_1124: "olmo_1124", + MODEL_ARCH.OLMO2: "olmo2", MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", @@ -1071,7 +1071,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], - MODEL_ARCH.OLMO_1124: [ + MODEL_ARCH.OLMO2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 4cbd39e03..1b6a3f4ad 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron olmoe olmo_1124 + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -54,7 +54,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124 + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -66,7 +66,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124 + "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 "norm", # llama-pth "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 @@ -145,7 +145,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124 + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -157,7 +157,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124 + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j @@ -170,7 +170,7 @@ class TensorNameMap: # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124 + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -188,7 +188,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124 + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -215,7 +215,7 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124 + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 ), # Rotary embeddings @@ -250,7 +250,7 @@ class TensorNameMap: # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( - "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124 + "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -273,7 +273,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124 + "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2 "layers.{bid}.feed_forward.w3", # llama-pth "encoder.layer.{bid}.intermediate.dense", # bert "transformer.h.{bid}.mlp.fc_in", # gpt-j @@ -314,7 +314,7 @@ class TensorNameMap: # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124 + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 "layers.{bid}.feed_forward.w1", # llama-pth "transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.c_fc2", # jais @@ -346,7 +346,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124 + "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2 "layers.{bid}.feed_forward.w2", # llama-pth "encoder.layer.{bid}.output.dense", # bert "transformer.h.{bid}.mlp.fc_out", # gpt-j @@ -383,7 +383,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124 + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2 "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm @@ -392,7 +392,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124 + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2 "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm diff --git a/src/llama.cpp b/src/llama.cpp index 571cb68e2..af5e686e0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -179,7 +179,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, - LLM_ARCH_OLMO_1124, + LLM_ARCH_OLMO2, LLM_ARCH_OLMOE, LLM_ARCH_OPENELM, LLM_ARCH_ARCTIC, @@ -233,7 +233,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, - { LLM_ARCH_OLMO_1124, "olmo_1124" }, + { LLM_ARCH_OLMO2, "olmo2" }, { LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_ARCTIC, "arctic" }, @@ -1210,7 +1210,7 @@ static const std::map> LLM_TENSOR_N }, }, { - LLM_ARCH_OLMO_1124, + LLM_ARCH_OLMO2, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, @@ -5900,7 +5900,7 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; - case LLM_ARCH_OLMO_1124: + case LLM_ARCH_OLMO2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -8593,7 +8593,7 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; - case LLM_ARCH_OLMO_1124: + case LLM_ARCH_OLMO2: { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -14483,7 +14483,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_olmo_1124() { + struct ggml_cgraph * build_olmo2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -16799,9 +16799,9 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; - case LLM_ARCH_OLMO_1124: + case LLM_ARCH_OLMO2: { - result = llm.build_olmo_1124(); + result = llm.build_olmo2(); } break; case LLM_ARCH_OLMOE: { @@ -20084,7 +20084,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2MOE: - case LLM_ARCH_OLMO_1124: + case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: