mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-05 16:10:42 +01:00
change gguf KV from clip to vit
This commit is contained in:
parent
4a7ab89d75
commit
431bb08059
@ -281,17 +281,17 @@ class Model:
|
|||||||
|
|
||||||
# Vision model parameters
|
# Vision model parameters
|
||||||
if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
|
if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
|
||||||
self.gguf_writer.add_vision_type("clip-vit")
|
self.gguf_writer.add_vision_type("vit")
|
||||||
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
|
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
|
||||||
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
|
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
|
||||||
self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
|
self.gguf_writer.add_vision_vit_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
|
||||||
self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
|
self.gguf_writer.add_vision_vit_block_count(self.vparams["num_hidden_layers"])
|
||||||
self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
|
self.gguf_writer.add_vision_vit_embedding_length(self.vparams["hidden_size"])
|
||||||
self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
|
self.gguf_writer.add_vision_vit_feed_forward_length(self.vparams["intermediate_size"])
|
||||||
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
|
self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
|
self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
|
||||||
self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
|
self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
|
||||||
self.gguf_writer.add_vision_clip_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
|
self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
|
||||||
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
logger.info(f"gguf: file type = {self.ftype}")
|
logger.info(f"gguf: file type = {self.ftype}")
|
||||||
@ -1690,15 +1690,15 @@ class LlamaModel(Model):
|
|||||||
|
|
||||||
# For vision model
|
# For vision model
|
||||||
if self.vparams is not None:
|
if self.vparams is not None:
|
||||||
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
|
self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
|
||||||
# TODO: should not hardcode these, but they are currently missing from config.json
|
# TODO: should not hardcode these, but they are currently missing from config.json
|
||||||
if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
|
if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
|
||||||
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
|
self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
|
||||||
if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
|
if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
|
||||||
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
|
self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
|
||||||
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
|
self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
|
||||||
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
|
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
|
||||||
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
|
self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||||
@ -2193,11 +2193,11 @@ class MiniCPMModel(Model):
|
|||||||
|
|
||||||
# For vision model
|
# For vision model
|
||||||
if self.vparams is not None and self.proj_type is not None:
|
if self.vparams is not None and self.proj_type is not None:
|
||||||
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
|
self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
|
||||||
self.gguf_writer.add_vision_clip_projector_type(self.proj_type)
|
self.gguf_writer.add_vision_vit_projector_type(self.proj_type)
|
||||||
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-06)
|
self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06)
|
||||||
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
|
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
|
||||||
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
|
self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
|
||||||
|
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
@ -215,29 +215,29 @@ class Keys:
|
|||||||
LORA_ALPHA = "adapter.lora.alpha"
|
LORA_ALPHA = "adapter.lora.alpha"
|
||||||
|
|
||||||
class Vision:
|
class Vision:
|
||||||
# only support vision.type = "clip-vit" for now
|
# only support vision.type = "vit" for now
|
||||||
TYPE = "vision.type"
|
TYPE = "vision.type"
|
||||||
IMAGE_SIZE = "vision.image_size"
|
IMAGE_SIZE = "vision.image_size"
|
||||||
PATCH_SIZE = "vision.patch_size"
|
PATCH_SIZE = "vision.patch_size"
|
||||||
IMAGE_MEAN = "vision.image_mean"
|
IMAGE_MEAN = "vision.image_mean"
|
||||||
IMAGE_STD = "vision.image_std"
|
IMAGE_STD = "vision.image_std"
|
||||||
|
|
||||||
class Clip:
|
class Vit:
|
||||||
ARCHITECTURE = "vision.clip.architecture"
|
ARCHITECTURE = "vision.vit.architecture"
|
||||||
CONTEXT_LENGTH = "vision.clip.context_length"
|
CONTEXT_LENGTH = "vision.vit.context_length"
|
||||||
EMBEDDING_LENGTH = "vision.clip.embedding_length"
|
EMBEDDING_LENGTH = "vision.vit.embedding_length"
|
||||||
BLOCK_COUNT = "vision.clip.block_count"
|
BLOCK_COUNT = "vision.vit.block_count"
|
||||||
FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length"
|
FEED_FORWARD_LENGTH = "vision.vit.feed_forward_length"
|
||||||
PROJECTION_TYPE = "vision.clip.projection_type"
|
PROJECTION_TYPE = "vision.vit.projection_type"
|
||||||
PROJECTION_DIM = "vision.clip.projection_dim"
|
PROJECTION_DIM = "vision.vit.projection_dim"
|
||||||
USE_GELU = "vision.clip.use_gelu"
|
USE_GELU = "vision.vit.use_gelu"
|
||||||
MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
|
MAX_POS_EMBEDDING = "vision.vit.max_position_embeddings"
|
||||||
MAX_SLICES = "vision.clip.max_slices"
|
MAX_SLICES = "vision.vit.max_slices"
|
||||||
PROJECTOR_TYPE = "vision.clip.projector_type"
|
PROJECTOR_TYPE = "vision.vit.projector_type"
|
||||||
SELECT_LAYER = "vision.clip.select_layer"
|
SELECT_LAYER = "vision.vit.select_layer"
|
||||||
PATCH_MERGE_TYPE = "vision.clip.patch_merge_type"
|
PATCH_MERGE_TYPE = "vision.vit.patch_merge_type"
|
||||||
HEAD_COUNT = "vision.clip.attention.head_count"
|
HEAD_COUNT = "vision.vit.attention.head_count"
|
||||||
LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"
|
LAYERNORM_EPS = "vision.vit.attention.layer_norm_epsilon"
|
||||||
|
|
||||||
#
|
#
|
||||||
# recommended mapping of model tensor names for storage in gguf
|
# recommended mapping of model tensor names for storage in gguf
|
||||||
|
@ -886,46 +886,46 @@ class GGUFWriter:
|
|||||||
def add_vision_patch_size(self, value: int) -> None:
|
def add_vision_patch_size(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.PATCH_SIZE, value)
|
self.add_uint32(Keys.Vision.PATCH_SIZE, value)
|
||||||
|
|
||||||
def add_vision_clip_architecture(self, value: str) -> None:
|
def add_vision_vit_architecture(self, value: str) -> None:
|
||||||
self.add_string(Keys.Vision.Clip.ARCHITECTURE, value)
|
self.add_string(Keys.Vision.Vit.ARCHITECTURE, value)
|
||||||
|
|
||||||
def add_vision_clip_context_length(self, value: int) -> None:
|
def add_vision_vit_context_length(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value)
|
self.add_uint32(Keys.Vision.Vit.CONTEXT_LENGTH, value)
|
||||||
|
|
||||||
def add_vision_clip_embedding_length(self, value: int) -> None:
|
def add_vision_vit_embedding_length(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value)
|
self.add_uint32(Keys.Vision.Vit.EMBEDDING_LENGTH, value)
|
||||||
|
|
||||||
def add_vision_clip_block_count(self, value: int) -> None:
|
def add_vision_vit_block_count(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value)
|
self.add_uint32(Keys.Vision.Vit.BLOCK_COUNT, value)
|
||||||
|
|
||||||
def add_vision_clip_feed_forward_length(self, value: int) -> None:
|
def add_vision_vit_feed_forward_length(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value)
|
self.add_uint32(Keys.Vision.Vit.FEED_FORWARD_LENGTH, value)
|
||||||
|
|
||||||
def add_vision_clip_head_count(self, value: int) -> None:
|
def add_vision_vit_head_count(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
|
self.add_uint32(Keys.Vision.Vit.HEAD_COUNT, value)
|
||||||
|
|
||||||
def add_vision_clip_max_position_embeddings(self, value: int) -> None:
|
def add_vision_vit_max_position_embeddings(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)
|
self.add_uint32(Keys.Vision.Vit.MAX_POS_EMBEDDING, value)
|
||||||
|
|
||||||
def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
|
def add_vision_vit_projector_type(self, value: CLIPProjectorType) -> None:
|
||||||
self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
|
self.add_string(Keys.Vision.Vit.PROJECTOR_TYPE, value.value)
|
||||||
|
|
||||||
def add_vision_clip_max_slices(self, value: int) -> None:
|
def add_vision_vit_max_slices(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
|
self.add_uint32(Keys.Vision.Vit.MAX_SLICES, value)
|
||||||
|
|
||||||
def add_vision_clip_select_layer(self, value: int) -> None:
|
def add_vision_vit_select_layer(self, value: int) -> None:
|
||||||
self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
|
self.add_int32(Keys.Vision.Vit.SELECT_LAYER, value)
|
||||||
|
|
||||||
def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
|
def add_vision_vit_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
|
||||||
self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
|
self.add_string(Keys.Vision.Vit.PATCH_MERGE_TYPE, value.value)
|
||||||
|
|
||||||
def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
|
def add_vision_vit_layer_norm_epsilon(self, value: float) -> None:
|
||||||
self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
|
self.add_float32(Keys.Vision.Vit.LAYERNORM_EPS, value)
|
||||||
|
|
||||||
def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
|
def add_vision_vit_image_mean(self, value: Sequence[float]) -> None:
|
||||||
self.add_array(Keys.Vision.IMAGE_MEAN, value)
|
self.add_array(Keys.Vision.IMAGE_MEAN, value)
|
||||||
|
|
||||||
def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
|
def add_vision_vit_image_std(self, value: Sequence[float]) -> None:
|
||||||
self.add_array(Keys.Vision.IMAGE_STD, value)
|
self.add_array(Keys.Vision.IMAGE_STD, value)
|
||||||
|
|
||||||
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
|
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
|
||||||
|
@ -195,21 +195,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||||||
{ LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" },
|
{ LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" },
|
||||||
{ LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" },
|
{ LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" },
|
||||||
{ LLM_KV_VISION_IMAGE_STD, "vision.image_std" },
|
{ LLM_KV_VISION_IMAGE_STD, "vision.image_std" },
|
||||||
{ LLM_KV_VISION_CLIP_ARCHITECTURE, "vision.clip.architecture" },
|
{ LLM_KV_VISION_VIT_ARCHITECTURE, "vision.vit.architecture" },
|
||||||
{ LLM_KV_VISION_CLIP_CONTEXT_LENGTH, "vision.clip.context_length" },
|
{ LLM_KV_VISION_VIT_CONTEXT_LENGTH, "vision.vit.context_length" },
|
||||||
{ LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, "vision.clip.embedding_length" },
|
{ LLM_KV_VISION_VIT_EMBEDDING_LENGTH, "vision.vit.embedding_length" },
|
||||||
{ LLM_KV_VISION_CLIP_BLOCK_COUNT, "vision.clip.block_count" },
|
{ LLM_KV_VISION_VIT_BLOCK_COUNT, "vision.vit.block_count" },
|
||||||
{ LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, "vision.clip.feed_forward_length" },
|
{ LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, "vision.vit.feed_forward_length" },
|
||||||
{ LLM_KV_VISION_CLIP_PROJECTION_TYPE, "vision.clip.projection_type" },
|
{ LLM_KV_VISION_VIT_PROJECTION_TYPE, "vision.vit.projection_type" },
|
||||||
{ LLM_KV_VISION_CLIP_PROJECTION_DIM, "vision.clip.projection_dim" },
|
{ LLM_KV_VISION_VIT_PROJECTION_DIM, "vision.vit.projection_dim" },
|
||||||
{ LLM_KV_VISION_CLIP_USE_GELU, "vision.clip.use_gelu" },
|
{ LLM_KV_VISION_VIT_USE_GELU, "vision.vit.use_gelu" },
|
||||||
{ LLM_KV_VISION_CLIP_MAX_POS_EMBD, "vision.clip.max_position_embeddings" },
|
{ LLM_KV_VISION_VIT_MAX_POS_EMBD, "vision.vit.max_position_embeddings" },
|
||||||
{ LLM_KV_VISION_CLIP_MAX_SLICES, "vision.clip.max_slices" },
|
{ LLM_KV_VISION_VIT_MAX_SLICES, "vision.vit.max_slices" },
|
||||||
{ LLM_KV_VISION_CLIP_PROJECTOR_TYPE, "vision.clip.projector_type" },
|
{ LLM_KV_VISION_VIT_PROJECTOR_TYPE, "vision.vit.projector_type" },
|
||||||
{ LLM_KV_VISION_CLIP_SELECT_LAYER, "vision.clip.select_layer" },
|
{ LLM_KV_VISION_VIT_SELECT_LAYER, "vision.vit.select_layer" },
|
||||||
{ LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" },
|
{ LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" },
|
||||||
{ LLM_KV_VISION_CLIP_HEAD_COUNT, "vision.clip.attention.head_count" },
|
{ LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" },
|
||||||
{ LLM_KV_VISION_CLIP_LAYERNORM_EPS, "vision.clip.attention.layer_norm_epsilon" },
|
{ LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" },
|
||||||
|
|
||||||
// deprecated
|
// deprecated
|
||||||
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
||||||
|
@ -205,21 +205,21 @@ enum llm_kv {
|
|||||||
LLM_KV_VISION_PATCH_SIZE,
|
LLM_KV_VISION_PATCH_SIZE,
|
||||||
LLM_KV_VISION_IMAGE_MEAN,
|
LLM_KV_VISION_IMAGE_MEAN,
|
||||||
LLM_KV_VISION_IMAGE_STD,
|
LLM_KV_VISION_IMAGE_STD,
|
||||||
LLM_KV_VISION_CLIP_ARCHITECTURE,
|
LLM_KV_VISION_VIT_ARCHITECTURE,
|
||||||
LLM_KV_VISION_CLIP_CONTEXT_LENGTH,
|
LLM_KV_VISION_VIT_CONTEXT_LENGTH,
|
||||||
LLM_KV_VISION_CLIP_EMBEDDING_LENGTH,
|
LLM_KV_VISION_VIT_EMBEDDING_LENGTH,
|
||||||
LLM_KV_VISION_CLIP_BLOCK_COUNT,
|
LLM_KV_VISION_VIT_BLOCK_COUNT,
|
||||||
LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH,
|
LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH,
|
||||||
LLM_KV_VISION_CLIP_PROJECTION_TYPE,
|
LLM_KV_VISION_VIT_PROJECTION_TYPE,
|
||||||
LLM_KV_VISION_CLIP_PROJECTION_DIM,
|
LLM_KV_VISION_VIT_PROJECTION_DIM,
|
||||||
LLM_KV_VISION_CLIP_USE_GELU,
|
LLM_KV_VISION_VIT_USE_GELU,
|
||||||
LLM_KV_VISION_CLIP_MAX_POS_EMBD,
|
LLM_KV_VISION_VIT_MAX_POS_EMBD,
|
||||||
LLM_KV_VISION_CLIP_MAX_SLICES,
|
LLM_KV_VISION_VIT_MAX_SLICES,
|
||||||
LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
|
LLM_KV_VISION_VIT_PROJECTOR_TYPE,
|
||||||
LLM_KV_VISION_CLIP_SELECT_LAYER,
|
LLM_KV_VISION_VIT_SELECT_LAYER,
|
||||||
LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
|
LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
|
||||||
LLM_KV_VISION_CLIP_HEAD_COUNT,
|
LLM_KV_VISION_VIT_HEAD_COUNT,
|
||||||
LLM_KV_VISION_CLIP_LAYERNORM_EPS,
|
LLM_KV_VISION_VIT_LAYERNORM_EPS,
|
||||||
|
|
||||||
// deprecated:
|
// deprecated:
|
||||||
LLM_KV_TOKENIZER_PREFIX_ID,
|
LLM_KV_TOKENIZER_PREFIX_ID,
|
||||||
|
@ -1251,23 +1251,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||||||
auto & vparams = clip.hparams;
|
auto & vparams = clip.hparams;
|
||||||
std::string vision_type;
|
std::string vision_type;
|
||||||
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
|
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
|
||||||
if (vision_type == "clip-vit") {
|
if (vision_type == "vit") {
|
||||||
LLAMA_LOG_INFO("%s: loading clip-vit vision model\n", __func__);
|
LLAMA_LOG_INFO("%s: loading ViT vision model\n", __func__);
|
||||||
has_vision = true;
|
has_vision = true;
|
||||||
ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
|
ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
|
||||||
ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
|
ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
|
||||||
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
|
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
|
||||||
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true);
|
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true);
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, vparams.hidden_size, true);
|
ml.get_key(LLM_KV_VISION_VIT_EMBEDDING_LENGTH, vparams.hidden_size, true);
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, vparams.n_layer, true);
|
ml.get_key(LLM_KV_VISION_VIT_BLOCK_COUNT, vparams.n_layer, true);
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
|
ml.get_key(LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
|
ml.get_key(LLM_KV_VISION_VIT_HEAD_COUNT, vparams.n_head, true);
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
|
ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true);
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
|
ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true);
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true);
|
ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true);
|
||||||
{
|
{
|
||||||
std::string name;
|
std::string name;
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
|
ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
|
||||||
vparams.proj_type = clip_projector_type_from_name(name);
|
vparams.proj_type = clip_projector_type_from_name(name);
|
||||||
if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
|
if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
|
||||||
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
|
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
|
||||||
@ -1275,12 +1275,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
std::string name;
|
std::string name;
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
|
ml.get_key(LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, name, false);
|
||||||
vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
|
vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
std::string arch;
|
std::string arch;
|
||||||
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
|
ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
|
||||||
vparams.arch = vision_arch_from_string(arch);
|
vparams.arch = vision_arch_from_string(arch);
|
||||||
if (vparams.arch == VISION_ARCH_UNKNOWN) {
|
if (vparams.arch == VISION_ARCH_UNKNOWN) {
|
||||||
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
|
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
|
||||||
|
Loading…
Reference in New Issue
Block a user