From 8586d23c8abdb17e28174fb0ebd12c27adfdadd3 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 23 Jan 2025 12:14:06 +0100
Subject: [PATCH] minicpm working without uhd

---
 convert_hf_to_gguf.py          | 25 +++++++++++++++++++++++++
 examples/vision/vision.cpp     |  2 +-
 gguf-py/gguf/constants.py      | 12 ++++++++++++
 gguf-py/gguf/tensor_mapping.py | 16 ++++++++++++++++
 src/llama-arch.cpp             |  4 ++++
 src/llama-arch.h               |  4 ++++
 src/llama-model.cpp            |  6 ++++++
 src/llama-vision.cpp           |  4 ++++
 src/llama-vision.h             |  6 +++++-
 9 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9a05e9960..e703cd33d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2339,6 +2339,7 @@ class MiniCPMVModel(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN2
     proj_type: gguf.constants.CLIPProjectorType | None
     resampler_n_embd = 0
+    tok_embd_tensor: Tensor | None = None
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -2361,6 +2362,8 @@ class MiniCPMVModel(Qwen2Model):
             for tname, tensor in self.get_tensors():
                 if tname == "resampler.ln_post.bias":
                     self.resampler_n_embd = tensor.shape[0]
+                if tname.endswith("embed_tokens.weight"):
+                    self.tok_embd_tensor = tensor
             if self.resampler_n_embd < 2:
                 raise ValueError("Failed to detect resampler embedding size")
         else:
@@ -2372,6 +2375,16 @@ class MiniCPMVModel(Qwen2Model):
         self.hparams["vision_feature_layer"] = 0
         self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
 
+    def get_embd_of_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, str]]) -> Iterable[tuple[str, Tensor]]:
+        if self.tok_embd_tensor is None:
+            raise ValueError("Token embedding tensor not found")
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        for token, tensor_name in map_token_to_tensor_name:
+            tok_id = tokenizer.get_vocab()[token]
+            row = self.tok_embd_tensor[tok_id]
+            yield tensor_name, row
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         # For vision model
@@ -2388,6 +2401,14 @@ class MiniCPMVModel(Qwen2Model):
             self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True),
             torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70)))
         )
+        added_tokens = [
+            ("<image>",  gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE    ] + ".weight"),
+            ("</image>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE] + ".weight"),
+            ("<slice>",  gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE    ] + ".weight"),
+            ("</slice>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE] + ".weight"),
+        ]
+        for tensor_name, tensor in self.get_embd_of_tokens(added_tokens):
+            yield tensor_name, tensor
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -2404,6 +2425,7 @@ class MiniCPMVModel(Qwen2Model):
             name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias)
             name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias)
             return [
+                # TODO: permute these
                 (self.map_tensor_name(name_q), split_tensor[0]),
                 (self.map_tensor_name(name_k), split_tensor[1]),
                 (self.map_tensor_name(name_v), split_tensor[2]),
@@ -2413,6 +2435,9 @@ class MiniCPMVModel(Qwen2Model):
         if name == "resampler.proj" or name == "resampler.query":
             name += ".weight"
 
+        if name.startswith("resampler.proj"):
+            data_torch = data_torch.transpose(-1, -2).contiguous()
+
         if "post_layernorm" in name:
             return [] # skip post_layernorm
 
diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp
index d994535f6..bb2cdbf4e 100644
--- a/examples/vision/vision.cpp
+++ b/examples/vision/vision.cpp
@@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
     // default prompt for llava 1.5
     //params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:\nwhat did you see?\nASSISTANT:";
     // default prompt for minicpmv 2.6
-    params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<|im_end|>\n<|im_start|>assistant\n";
+    params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<|im_end|>\n<|im_start|>assistant\n";
     params.n_predict = 64;
     params.n_batch = 2048;
     params.n_ubatch = 1024;
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 6cc9609fc..f4da3e234 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -467,6 +467,10 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_Q_NORM      = auto() # minicpmv
     V_RESMPL_PROJ        = auto() # minicpmv
     V_RESMPL_QUERY       = auto() # minicpmv
+    V_TOK_EMBD_IMAGE     = auto() # embedding for <image> token
+    V_TOK_EMBD_END_IMAGE = auto() # embedding for </image> token
+    V_TOK_EMBD_SLICE     = auto() # embedding for <slice> token
+    V_TOK_EMBD_END_SLICE = auto() # embedding for </slice> token
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -686,6 +690,10 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_RESMPL_Q_NORM:         "v.resmpl.q_norm",
     MODEL_TENSOR.V_RESMPL_PROJ:           "v.resmpl.proj",
     MODEL_TENSOR.V_RESMPL_QUERY:          "v.resmpl.query",
+    MODEL_TENSOR.V_TOK_EMBD_IMAGE:        "v.tok_embd.image",
+    MODEL_TENSOR.V_TOK_EMBD_END_IMAGE:    "v.tok_embd.end_image",
+    MODEL_TENSOR.V_TOK_EMBD_SLICE:        "v.tok_embd.slice",
+    MODEL_TENSOR.V_TOK_EMBD_END_SLICE:    "v.tok_embd.end_slice",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1682,6 +1690,10 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_RESMPL_Q_NORM,
         MODEL_TENSOR.V_RESMPL_PROJ,
         MODEL_TENSOR.V_RESMPL_QUERY,
+        MODEL_TENSOR.V_TOK_EMBD_IMAGE,
+        MODEL_TENSOR.V_TOK_EMBD_END_IMAGE,
+        MODEL_TENSOR.V_TOK_EMBD_SLICE,
+        MODEL_TENSOR.V_TOK_EMBD_END_SLICE,
     ],
     # TODO
 }
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index b756ec184..0228e8400 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -907,6 +907,22 @@ class TensorNameMap:
         MODEL_TENSOR.V_RESMPL_QUERY: (
             "resampler.query",
         ),
+
+        MODEL_TENSOR.V_TOK_EMBD_IMAGE:(
+            "v.tok_embd.image", # tensor generated from token embeddings
+        ),
+
+        MODEL_TENSOR.V_TOK_EMBD_END_IMAGE:(
+            "v.tok_embd.end_image", # tensor generated from token embeddings
+        ),
+
+        MODEL_TENSOR.V_TOK_EMBD_SLICE:(
+            "v.tok_embd.slice", # tensor generated from token embeddings
+        ),
+
+        MODEL_TENSOR.V_TOK_EMBD_END_SLICE:(
+            "v.tok_embd.end_slice", # tensor generated from token embeddings
+        ),
     }
 
     # architecture-specific block mappings
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0b20b03f6..1a6d45331 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -1382,6 +1382,10 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_V_RESMPL_Q_NORM,      "v.resmpl.q_norm" },
             { LLM_TENSOR_V_RESMPL_PROJ,        "v.resmpl.proj" },
             { LLM_TENSOR_V_RESMPL_QUERY,       "v.resmpl.query" },
+            { LLM_TENSOR_V_TOK_EMBD_IMAGE,     "v.tok_embd.image" },
+            { LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "v.tok_embd.end_image" },
+            { LLM_TENSOR_V_TOK_EMBD_SLICE,     "v.tok_embd.slice" },
+            { LLM_TENSOR_V_TOK_EMBD_END_SLICE, "v.tok_embd.end_slice" },
         }
     },
     {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 4f3e76a5f..3440ded53 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -381,6 +381,10 @@ enum llm_tensor {
     LLM_TENSOR_V_RESMPL_Q_NORM,
     LLM_TENSOR_V_RESMPL_PROJ,
     LLM_TENSOR_V_RESMPL_QUERY,
+    LLM_TENSOR_V_TOK_EMBD_IMAGE,
+    LLM_TENSOR_V_TOK_EMBD_END_IMAGE,
+    LLM_TENSOR_V_TOK_EMBD_SLICE,
+    LLM_TENSOR_V_TOK_EMBD_END_SLICE,
 };
 
 enum llm_tensor_layer {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 1bebc7988..4aed37d89 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3549,6 +3549,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     vit.mm_model_ln_post_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight"), {rs_n_embd});
                     vit.mm_model_ln_post_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias"  ), {rs_n_embd});
 
+                    // tok embd
+                    vit.mm_tok_embd_image     = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_IMAGE,     "weight"), {n_embd});
+                    vit.mm_tok_embd_end_image = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "weight"), {n_embd});
+                    vit.mm_tok_embd_slice     = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_SLICE,     "weight"), {n_embd});
+                    vit.mm_tok_embd_end_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd});
+
                     for (int i = 0; i < n_vlayer; ++i) {
                         auto & layer = vit.layers[i];
 
diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp
index d4471cd2e..ca65d536b 100644
--- a/src/llama-vision.cpp
+++ b/src/llama-vision.cpp
@@ -895,6 +895,10 @@ struct llama_vision_graph_builder {
             cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
         }
 
+        // add <image> and </image> token embeddings
+        cur = ggml_concat(ctx0, model.mm_tok_embd_image, cur, 1);
+        cur = ggml_concat(ctx0, cur, model.mm_tok_embd_end_image, 1);
+
         ggml_set_name(cur, "output");
         ggml_build_forward_expand(gf, cur);
 
diff --git a/src/llama-vision.h b/src/llama-vision.h
index 45cb75944..948c8d0ed 100644
--- a/src/llama-vision.h
+++ b/src/llama-vision.h
@@ -129,7 +129,11 @@ struct llama_vision_model {
     struct ggml_tensor * mm_model_ln_post_w = nullptr;
     struct ggml_tensor * mm_model_ln_post_b = nullptr;
 
-    struct ggml_tensor * image_newline = nullptr;
+    // special tokens
+    struct ggml_tensor * mm_tok_embd_image     = nullptr;
+    struct ggml_tensor * mm_tok_embd_end_image = nullptr;
+    struct ggml_tensor * mm_tok_embd_slice     = nullptr;
+    struct ggml_tensor * mm_tok_embd_end_slice = nullptr;
 };
 
 struct llama_vision_context {