From 15499eb94227401bdc8875da6eb85c15d37068f7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 22 Feb 2024 17:05:23 -0500 Subject: [PATCH] mpt : do not duplicate token_embd.weight on disk (#5670) --- convert-hf-to-gguf.py | 5 ----- llama.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 481198dad..9bdfce07a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -622,11 +622,6 @@ class MPTModel(Model): self.gguf_writer.add_tensor(new_name, data) - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - self.gguf_writer.add_tensor("output.weight", data) - class OrionModel(Model): def set_vocab(self): diff --git a/llama.cpp b/llama.cpp index 2ebd40df2..37477e6ef 100644 --- a/llama.cpp +++ b/llama.cpp @@ -509,7 +509,6 @@ static std::map> LLM_TENSOR_NAMES = { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4056,7 +4055,10 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + // same as tok_embd, duplicated to allow offloading + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); } for (int i = 0; i < n_layer; ++i) {