From 431bb0805919ef74cbae8ca918301c468c642380 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 21 Jan 2025 10:51:26 +0100
Subject: [PATCH] change gguf KV from clip to vit

---
 convert_hf_to_gguf.py       | 36 ++++++++++++-------------
 gguf-py/gguf/constants.py   | 34 ++++++++++++------------
 gguf-py/gguf/gguf_writer.py | 52 ++++++++++++++++++-------------------
 src/llama-arch.cpp          | 30 ++++++++++-----------
 src/llama-arch.h            | 30 ++++++++++-----------
 src/llama-model.cpp         | 24 ++++++++---------
 6 files changed, 103 insertions(+), 103 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bf6ffb49c..d32272ac2 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -281,17 +281,17 @@ class Model:
 
         # Vision model parameters
         if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
-            self.gguf_writer.add_vision_type("clip-vit")
+            self.gguf_writer.add_vision_type("vit")
             self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
             self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
-            self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
-            self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
-            self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
-            self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
-            self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
-            self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
-            self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
-            self.gguf_writer.add_vision_clip_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+            self.gguf_writer.add_vision_vit_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
+            self.gguf_writer.add_vision_vit_block_count(self.vparams["num_hidden_layers"])
+            self.gguf_writer.add_vision_vit_embedding_length(self.vparams["hidden_size"])
+            self.gguf_writer.add_vision_vit_feed_forward_length(self.vparams["intermediate_size"])
+            self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
+            self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
+            self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
 
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
@@ -1690,15 +1690,15 @@ class LlamaModel(Model):
 
         # For vision model
         if self.vparams is not None:
-            self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
+            self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             # TODO: should not hardcode these, but they are currently missing from config.json
             if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
-                self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
+                self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
             if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
-                self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
-            self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
+                self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
+            self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
             max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
-            self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
+            self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
 
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -2193,11 +2193,11 @@ class MiniCPMModel(Model):
 
         # For vision model
         if self.vparams is not None and self.proj_type is not None:
-            self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
-            self.gguf_writer.add_vision_clip_projector_type(self.proj_type)
-            self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-06)
+            self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
+            self.gguf_writer.add_vision_vit_projector_type(self.proj_type)
+            self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06)
             max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
-            self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
+            self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index bd7befed2..601016eda 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -215,29 +215,29 @@ class Keys:
         LORA_ALPHA = "adapter.lora.alpha"
 
     class Vision:
-        # only support vision.type = "clip-vit" for now
+        # only support vision.type = "vit" for now
         TYPE       = "vision.type"
         IMAGE_SIZE = "vision.image_size"
         PATCH_SIZE = "vision.patch_size"
         IMAGE_MEAN = "vision.image_mean"
         IMAGE_STD  = "vision.image_std"
 
-        class Clip:
-            ARCHITECTURE        = "vision.clip.architecture"
-            CONTEXT_LENGTH      = "vision.clip.context_length"
-            EMBEDDING_LENGTH    = "vision.clip.embedding_length"
-            BLOCK_COUNT         = "vision.clip.block_count"
-            FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length"
-            PROJECTION_TYPE     = "vision.clip.projection_type"
-            PROJECTION_DIM      = "vision.clip.projection_dim"
-            USE_GELU            = "vision.clip.use_gelu"
-            MAX_POS_EMBEDDING   = "vision.clip.max_position_embeddings"
-            MAX_SLICES          = "vision.clip.max_slices"
-            PROJECTOR_TYPE      = "vision.clip.projector_type"
-            SELECT_LAYER        = "vision.clip.select_layer"
-            PATCH_MERGE_TYPE    = "vision.clip.patch_merge_type"
-            HEAD_COUNT          = "vision.clip.attention.head_count"
-            LAYERNORM_EPS       = "vision.clip.attention.layer_norm_epsilon"
+        class Vit:
+            ARCHITECTURE        = "vision.vit.architecture"
+            CONTEXT_LENGTH      = "vision.vit.context_length"
+            EMBEDDING_LENGTH    = "vision.vit.embedding_length"
+            BLOCK_COUNT         = "vision.vit.block_count"
+            FEED_FORWARD_LENGTH = "vision.vit.feed_forward_length"
+            PROJECTION_TYPE     = "vision.vit.projection_type"
+            PROJECTION_DIM      = "vision.vit.projection_dim"
+            USE_GELU            = "vision.vit.use_gelu"
+            MAX_POS_EMBEDDING   = "vision.vit.max_position_embeddings"
+            MAX_SLICES          = "vision.vit.max_slices"
+            PROJECTOR_TYPE      = "vision.vit.projector_type"
+            SELECT_LAYER        = "vision.vit.select_layer"
+            PATCH_MERGE_TYPE    = "vision.vit.patch_merge_type"
+            HEAD_COUNT          = "vision.vit.attention.head_count"
+            LAYERNORM_EPS       = "vision.vit.attention.layer_norm_epsilon"
 
 #
 # recommended mapping of model tensor names for storage in gguf
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 4b9a0c966..65d0e8f30 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -886,46 +886,46 @@ class GGUFWriter:
     def add_vision_patch_size(self, value: int) -> None:
         self.add_uint32(Keys.Vision.PATCH_SIZE, value)
 
-    def add_vision_clip_architecture(self, value: str) -> None:
-        self.add_string(Keys.Vision.Clip.ARCHITECTURE, value)
+    def add_vision_vit_architecture(self, value: str) -> None:
+        self.add_string(Keys.Vision.Vit.ARCHITECTURE, value)
 
-    def add_vision_clip_context_length(self, value: int) -> None:
-        self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value)
+    def add_vision_vit_context_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Vit.CONTEXT_LENGTH, value)
 
-    def add_vision_clip_embedding_length(self, value: int) -> None:
-        self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value)
+    def add_vision_vit_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Vit.EMBEDDING_LENGTH, value)
 
-    def add_vision_clip_block_count(self, value: int) -> None:
-        self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value)
+    def add_vision_vit_block_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Vit.BLOCK_COUNT, value)
 
-    def add_vision_clip_feed_forward_length(self, value: int) -> None:
-        self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value)
+    def add_vision_vit_feed_forward_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Vit.FEED_FORWARD_LENGTH, value)
 
-    def add_vision_clip_head_count(self, value: int) -> None:
-        self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
+    def add_vision_vit_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Vit.HEAD_COUNT, value)
 
-    def add_vision_clip_max_position_embeddings(self, value: int) -> None:
-        self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)
+    def add_vision_vit_max_position_embeddings(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Vit.MAX_POS_EMBEDDING, value)
 
-    def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
-        self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
+    def add_vision_vit_projector_type(self, value: CLIPProjectorType) -> None:
+        self.add_string(Keys.Vision.Vit.PROJECTOR_TYPE, value.value)
 
-    def add_vision_clip_max_slices(self, value: int) -> None:
-        self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
+    def add_vision_vit_max_slices(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Vit.MAX_SLICES, value)
 
-    def add_vision_clip_select_layer(self, value: int) -> None:
-        self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
+    def add_vision_vit_select_layer(self, value: int) -> None:
+        self.add_int32(Keys.Vision.Vit.SELECT_LAYER, value)
 
-    def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
-        self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
+    def add_vision_vit_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
+        self.add_string(Keys.Vision.Vit.PATCH_MERGE_TYPE, value.value)
 
-    def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
-        self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
+    def add_vision_vit_layer_norm_epsilon(self, value: float) -> None:
+        self.add_float32(Keys.Vision.Vit.LAYERNORM_EPS, value)
 
-    def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
+    def add_vision_vit_image_mean(self, value: Sequence[float]) -> None:
         self.add_array(Keys.Vision.IMAGE_MEAN, value)
 
-    def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
+    def add_vision_vit_image_std(self, value: Sequence[float]) -> None:
         self.add_array(Keys.Vision.IMAGE_STD, value)
 
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index e2908c0ae..48336943c 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -195,21 +195,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VISION_PATCH_SIZE,                "vision.patch_size" },
     { LLM_KV_VISION_IMAGE_MEAN,                "vision.image_mean" },
     { LLM_KV_VISION_IMAGE_STD,                 "vision.image_std" },
-    { LLM_KV_VISION_CLIP_ARCHITECTURE,         "vision.clip.architecture" },
-    { LLM_KV_VISION_CLIP_CONTEXT_LENGTH,       "vision.clip.context_length" },
-    { LLM_KV_VISION_CLIP_EMBEDDING_LENGTH,     "vision.clip.embedding_length" },
-    { LLM_KV_VISION_CLIP_BLOCK_COUNT,          "vision.clip.block_count" },
-    { LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH,  "vision.clip.feed_forward_length" },
-    { LLM_KV_VISION_CLIP_PROJECTION_TYPE,      "vision.clip.projection_type" },
-    { LLM_KV_VISION_CLIP_PROJECTION_DIM,       "vision.clip.projection_dim" },
-    { LLM_KV_VISION_CLIP_USE_GELU,             "vision.clip.use_gelu" },
-    { LLM_KV_VISION_CLIP_MAX_POS_EMBD,         "vision.clip.max_position_embeddings" },
-    { LLM_KV_VISION_CLIP_MAX_SLICES,           "vision.clip.max_slices" },
-    { LLM_KV_VISION_CLIP_PROJECTOR_TYPE,       "vision.clip.projector_type" },
-    { LLM_KV_VISION_CLIP_SELECT_LAYER,         "vision.clip.select_layer" },
-    { LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,     "vision.clip.patch_merge_type" },
-    { LLM_KV_VISION_CLIP_HEAD_COUNT,           "vision.clip.attention.head_count" },
-    { LLM_KV_VISION_CLIP_LAYERNORM_EPS,        "vision.clip.attention.layer_norm_epsilon" },
+    { LLM_KV_VISION_VIT_ARCHITECTURE,          "vision.vit.architecture" },
+    { LLM_KV_VISION_VIT_CONTEXT_LENGTH,        "vision.vit.context_length" },
+    { LLM_KV_VISION_VIT_EMBEDDING_LENGTH,      "vision.vit.embedding_length" },
+    { LLM_KV_VISION_VIT_BLOCK_COUNT,           "vision.vit.block_count" },
+    { LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH,   "vision.vit.feed_forward_length" },
+    { LLM_KV_VISION_VIT_PROJECTION_TYPE,       "vision.vit.projection_type" },
+    { LLM_KV_VISION_VIT_PROJECTION_DIM,        "vision.vit.projection_dim" },
+    { LLM_KV_VISION_VIT_USE_GELU,              "vision.vit.use_gelu" },
+    { LLM_KV_VISION_VIT_MAX_POS_EMBD,          "vision.vit.max_position_embeddings" },
+    { LLM_KV_VISION_VIT_MAX_SLICES,            "vision.vit.max_slices" },
+    { LLM_KV_VISION_VIT_PROJECTOR_TYPE,        "vision.vit.projector_type" },
+    { LLM_KV_VISION_VIT_SELECT_LAYER,          "vision.vit.select_layer" },
+    { LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,      "vision.vit.patch_merge_type" },
+    { LLM_KV_VISION_VIT_HEAD_COUNT,            "vision.vit.attention.head_count" },
+    { LLM_KV_VISION_VIT_LAYERNORM_EPS,         "vision.vit.attention.layer_norm_epsilon" },
 
     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID,              "tokenizer.ggml.prefix_token_id" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 7d4a1cd8c..5629dc46d 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -205,21 +205,21 @@ enum llm_kv {
     LLM_KV_VISION_PATCH_SIZE,
    LLM_KV_VISION_IMAGE_MEAN,
     LLM_KV_VISION_IMAGE_STD,
-    LLM_KV_VISION_CLIP_ARCHITECTURE,
-    LLM_KV_VISION_CLIP_CONTEXT_LENGTH,
-    LLM_KV_VISION_CLIP_EMBEDDING_LENGTH,
-    LLM_KV_VISION_CLIP_BLOCK_COUNT,
-    LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH,
-    LLM_KV_VISION_CLIP_PROJECTION_TYPE,
-    LLM_KV_VISION_CLIP_PROJECTION_DIM,
-    LLM_KV_VISION_CLIP_USE_GELU,
-    LLM_KV_VISION_CLIP_MAX_POS_EMBD,
-    LLM_KV_VISION_CLIP_MAX_SLICES,
-    LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
-    LLM_KV_VISION_CLIP_SELECT_LAYER,
-    LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
-    LLM_KV_VISION_CLIP_HEAD_COUNT,
-    LLM_KV_VISION_CLIP_LAYERNORM_EPS,
+    LLM_KV_VISION_VIT_ARCHITECTURE,
+    LLM_KV_VISION_VIT_CONTEXT_LENGTH,
+    LLM_KV_VISION_VIT_EMBEDDING_LENGTH,
+    LLM_KV_VISION_VIT_BLOCK_COUNT,
+    LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH,
+    LLM_KV_VISION_VIT_PROJECTION_TYPE,
+    LLM_KV_VISION_VIT_PROJECTION_DIM,
+    LLM_KV_VISION_VIT_USE_GELU,
+    LLM_KV_VISION_VIT_MAX_POS_EMBD,
+    LLM_KV_VISION_VIT_MAX_SLICES,
+    LLM_KV_VISION_VIT_PROJECTOR_TYPE,
+    LLM_KV_VISION_VIT_SELECT_LAYER,
+    LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
+    LLM_KV_VISION_VIT_HEAD_COUNT,
+    LLM_KV_VISION_VIT_LAYERNORM_EPS,
 
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d4d53aba6..a305fa463 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1251,23 +1251,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     auto & vparams = clip.hparams;
     std::string vision_type;
     ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
-    if (vision_type == "clip-vit") {
-        LLAMA_LOG_INFO("%s: loading clip-vit vision model\n", __func__);
+    if (vision_type == "vit") {
+        LLAMA_LOG_INFO("%s: loading ViT vision model\n", __func__);
         has_vision = true;
         ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
         ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
         ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
         ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD,  vparams.image_std,  3, true);
-        ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH,    vparams.hidden_size,    true);
-        ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT,         vparams.n_layer,        true);
-        ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
-        ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT,          vparams.n_head,         true);
-        ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS,       vparams.eps,            true);
-        ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER,        vparams.select_layer,   true);
-        ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD,        vparams.max_pos_embd,   true);
+        ml.get_key(LLM_KV_VISION_VIT_EMBEDDING_LENGTH,    vparams.hidden_size,    true);
+        ml.get_key(LLM_KV_VISION_VIT_BLOCK_COUNT,         vparams.n_layer,        true);
+        ml.get_key(LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
+        ml.get_key(LLM_KV_VISION_VIT_HEAD_COUNT,          vparams.n_head,         true);
+        ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS,       vparams.eps,            true);
+        ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER,        vparams.select_layer,   true);
+        ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD,        vparams.max_pos_embd,   true);
         {
             std::string name;
-            ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
+            ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
             vparams.proj_type = clip_projector_type_from_name(name);
             if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
                 throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
@@ -1275,12 +1275,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         }
         {
             std::string name;
-            ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
+            ml.get_key(LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, name, false);
             vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
         }
         {
             std::string arch;
-            ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
+            ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
             vparams.arch = vision_arch_from_string(arch);
             if (vparams.arch == VISION_ARCH_UNKNOWN) {
                 throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
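
A minimal usage sketch (not part of the patch): writing the renamed "vision.vit.*" keys through the gguf-py API updated above. The output path and the hyperparameter values below (ViT-L/14-style: image size 336, patch size 14, 24 blocks, hidden size 1024, 16 heads) are illustrative assumptions, not values taken from this change; a real conversion pulls them from the HF config.json as shown in convert_hf_to_gguf.py.

    from gguf import GGUFWriter

    w = GGUFWriter("vision-test.gguf", arch="llama")  # hypothetical output path
    w.add_vision_type("vit")                          # was "clip-vit"
    w.add_vision_image_size(336)
    w.add_vision_patch_size(14)
    w.add_vision_vit_block_count(24)                  # was add_vision_clip_block_count
    w.add_vision_vit_embedding_length(1024)           # was add_vision_clip_embedding_length
    w.add_vision_vit_head_count(16)                   # was add_vision_clip_head_count
    w.write_header_to_file()                          # header first, then the KV section
    w.write_kv_data_to_file()
    w.close()

Note that a loader built before this patch checks for vision.type == "clip-vit" in llama-model.cpp, so files written with the new keys are not readable by older builds.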