Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2025-02-05 16:10:42 +01:00
reuse LLM_ARCH and LLM_TENSOR
This commit is contained in:
parent 431bb08059
commit bd0714b977
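The change below folds the vision architectures and tensors into the existing llm_arch / llm_tensor enums, so vision tensors are named through the same LLM_TN helper as the text-model tensors. As a rough usage sketch (a hypothetical call site, not part of this diff; it assumes the LLM_TN / LLM_TN_IMPL definitions and the LLM_TENSOR_NAMES entries shown further down):

    // hypothetical sketch of the unified naming path after this change
    LLM_TN tn(LLM_ARCH_VISION_LLAVA);

    // format string "v.enc.blk.%d.attn_q" from LLM_TENSOR_NAMES, block index 3,
    // with the ".weight" suffix appended by LLM_TN_IMPL::str()
    std::string name = tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", 3); // -> "v.enc.blk.3.attn_q.weight"

    // entries without a %d take no block index
    std::string pre_norm = tn(LLM_TENSOR_V_PRE_NORM, "bias");    // -> "v.pre_norm.bias"

Per the LLM_TN_IMPL::str() shown in the diff, a tensor that has no name mapping for the given arch resolves to the sentinel "__missing__".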
@@ -63,6 +63,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_VISION_LLAVA, "llava" },
{ LLM_ARCH_VISION_MOBILEVLM, "mobilevlm" },
{ LLM_ARCH_VISION_MINICPMV, "minicpmv" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1314,6 +1317,70 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
},
},
// vision
{
LLM_ARCH_VISION_LLAVA,
{
{ LLM_TENSOR_V_MMPROJ, "v.mmproj_%d" },
{ LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_PRE_NORM, "v.pre_norm" },
{ LLM_TENSOR_V_POST_NORM, "v.post_norm" },
}
},
{
LLM_ARCH_VISION_MOBILEVLM,
{
{ LLM_TENSOR_V_MMPROJ_MLP, "v.mmproj.mlp.%d" },
{ LLM_TENSOR_V_MMPROJ_PEG, "v.mmproj.peg.%d" },
{ LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_PRE_NORM, "v.pre_norm" },
{ LLM_TENSOR_V_POST_NORM, "v.post_norm" },
}
},
{
LLM_ARCH_VISION_MINICPMV,
{
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" },
{ LLM_TENSOR_V_RESMPL_ATTN_IN, "v.resmpl.attn_in" },
{ LLM_TENSOR_V_RESMPL_ATTN_OUT, "v.resmpl.attn_out" },
{ LLM_TENSOR_V_RESMPL_KV_PROJ, "v.resmpl.kv_proj" },
{ LLM_TENSOR_V_RESMPL_NORM_POST, "v.resmpl.norm_post" },
{ LLM_TENSOR_V_RESMPL_NORM_KV, "v.resmpl.norm_kv" },
{ LLM_TENSOR_V_RESMPL_NORM_Q, "v.resmpl.norm_q" },
{ LLM_TENSOR_V_RESMPL_PROJ, "v.resmpl.proj" },
{ LLM_TENSOR_V_RESMPL_QUERY, "v.resmpl.query" },
}
},
{
LLM_ARCH_UNKNOWN,
{
@@ -1322,72 +1389,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
},
};

static const std::map<vision_arch, std::map<vision_tensor, const char *>> VISION_TENSOR_NAMES = {
{
VISION_ARCH_LLAVA,
{
{ VISION_TENSOR_MMPROJ, "v.mmproj_%d" },
{ VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
{ VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ VISION_TENSOR_PRE_NORM, "v.pre_norm" },
{ VISION_TENSOR_POST_NORM, "v.post_norm" },
}
},
{
VISION_ARCH_MOBILEVLM,
{
{ VISION_TENSOR_MMPROJ_MLP, "v.mmproj.mlp.%d" },
{ VISION_TENSOR_MMPROJ_PEG, "v.mmproj.peg.%d" },
{ VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
{ VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ VISION_TENSOR_PRE_NORM, "v.pre_norm" },
{ VISION_TENSOR_POST_NORM, "v.post_norm" },
}
},
{
VISION_ARCH_MINICPMV,
{
{ VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
{ VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ VISION_TENSOR_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" },
{ VISION_TENSOR_RESMPL_ATTN_IN, "v.resmpl.attn_in" },
{ VISION_TENSOR_RESMPL_ATTN_OUT, "v.resmpl.attn_out" },
{ VISION_TENSOR_RESMPL_KV_PROJ, "v.resmpl.kv_proj" },
{ VISION_TENSOR_RESMPL_NORM_POST, "v.resmpl.norm_post" },
{ VISION_TENSOR_RESMPL_NORM_KV, "v.resmpl.norm_kv" },
{ VISION_TENSOR_RESMPL_NORM_Q, "v.resmpl.norm_q" },
{ VISION_TENSOR_RESMPL_PROJ, "v.resmpl.proj" },
{ VISION_TENSOR_RESMPL_QUERY, "v.resmpl.query" },
}
},
};

static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
@@ -1537,12 +1538,7 @@ std::string LLM_KV::operator()(llm_kv kv) const {
return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
}

template<>
std::string BASE_TN_IMPL<llm_arch, llm_tensor>::str() const {
if (LLM_TENSOR_NAMES.find(arch) == LLM_TENSOR_NAMES.end()) {
throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch));
}

std::string LLM_TN_IMPL::str() const {
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
@@ -1557,26 +1553,6 @@ std::string BASE_TN_IMPL<llm_arch, llm_tensor>::str() const {
return name;
}

template<>
std::string BASE_TN_IMPL<vision_arch, vision_tensor>::str() const {
if (VISION_TENSOR_NAMES.find(arch) == VISION_TENSOR_NAMES.end()) {
throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch));
}

if (VISION_TENSOR_NAMES.at(arch).find(tensor) == VISION_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}

std::string name = ::format(VISION_TENSOR_NAMES.at(arch).at(tensor), bid, xid);

if (suffix != nullptr) {
name += ".";
name += suffix;
}

return name;
}

const char * llm_arch_name(llm_arch arch) {
auto it = LLM_ARCH_NAMES.find(arch);
if (it == LLM_ARCH_NAMES.end()) {
@@ -66,16 +66,13 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
LLM_ARCH_WAVTOKENIZER_DEC,
// vision
LLM_ARCH_VISION_LLAVA,
LLM_ARCH_VISION_MOBILEVLM,
LLM_ARCH_VISION_MINICPMV,
LLM_ARCH_UNKNOWN,
};

enum vision_arch {
VISION_ARCH_UNKNOWN,
VISION_ARCH_LLAVA,
VISION_ARCH_MOBILEVLM,
VISION_ARCH_MINICPMV,
};

enum llm_kv {
LLM_KV_GENERAL_TYPE,
LLM_KV_GENERAL_ARCHITECTURE,
@@ -354,35 +351,33 @@ enum llm_tensor {
LLM_TENSOR_POS_NET_ATTN_K,
LLM_TENSOR_POS_NET_ATTN_V,
LLM_TENSOR_POS_NET_ATTN_OUT,
};

enum vision_tensor {
VISION_TENSOR_MMPROJ,
VISION_TENSOR_MMPROJ_MLP,
VISION_TENSOR_MMPROJ_PEG,
VISION_TENSOR_ENC_EMBD_CLS,
VISION_TENSOR_ENC_EMBD_PATCH,
VISION_TENSOR_ENC_EMBD_POS,
VISION_TENSOR_ENC_ATTN_Q,
VISION_TENSOR_ENC_ATTN_K,
VISION_TENSOR_ENC_ATTN_V,
VISION_TENSOR_ENC_INPUT_NORM,
VISION_TENSOR_ENC_OUTPUT,
VISION_TENSOR_ENC_OUTPUT_NORM,
VISION_TENSOR_ENC_FFN_UP,
VISION_TENSOR_ENC_FFN_DOWN,
VISION_TENSOR_PRE_NORM,
VISION_TENSOR_POST_NORM,
// minicpmv
VISION_TENSOR_RESMPL_POS_EMBD_K,
VISION_TENSOR_RESMPL_ATTN_IN,
VISION_TENSOR_RESMPL_ATTN_OUT,
VISION_TENSOR_RESMPL_KV_PROJ,
VISION_TENSOR_RESMPL_NORM_POST,
VISION_TENSOR_RESMPL_NORM_KV,
VISION_TENSOR_RESMPL_NORM_Q,
VISION_TENSOR_RESMPL_PROJ,
VISION_TENSOR_RESMPL_QUERY,
// vision
LLM_TENSOR_V_MMPROJ,
LLM_TENSOR_V_MMPROJ_MLP,
LLM_TENSOR_V_MMPROJ_PEG,
LLM_TENSOR_V_ENC_EMBD_CLS,
LLM_TENSOR_V_ENC_EMBD_PATCH,
LLM_TENSOR_V_ENC_EMBD_POS,
LLM_TENSOR_V_ENC_ATTN_Q,
LLM_TENSOR_V_ENC_ATTN_K,
LLM_TENSOR_V_ENC_ATTN_V,
LLM_TENSOR_V_ENC_INPUT_NORM,
LLM_TENSOR_V_ENC_OUTPUT,
LLM_TENSOR_V_ENC_OUTPUT_NORM,
LLM_TENSOR_V_ENC_FFN_UP,
LLM_TENSOR_V_ENC_FFN_DOWN,
LLM_TENSOR_V_PRE_NORM,
LLM_TENSOR_V_POST_NORM,
// vision - minicpmv
LLM_TENSOR_V_RESMPL_POS_EMBD_K,
LLM_TENSOR_V_RESMPL_ATTN_IN,
LLM_TENSOR_V_RESMPL_ATTN_OUT,
LLM_TENSOR_V_RESMPL_KV_PROJ,
LLM_TENSOR_V_RESMPL_NORM_POST,
LLM_TENSOR_V_RESMPL_NORM_KV,
LLM_TENSOR_V_RESMPL_NORM_Q,
LLM_TENSOR_V_RESMPL_PROJ,
LLM_TENSOR_V_RESMPL_QUERY,
};

enum llm_tensor_layer {
@@ -408,10 +403,9 @@ struct LLM_KV {
// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
//
template<typename Tname, typename Ttensor>
struct BASE_TN_IMPL {
const Tname arch;
const Ttensor tensor;
struct LLM_TN_IMPL {
const llm_arch arch;
const llm_tensor tensor;
const char * const suffix;
const int bid;
const int xid;
@@ -422,16 +416,15 @@ struct BASE_TN_IMPL {
return str();
}

friend bool operator==(const std::string & str, const BASE_TN_IMPL & tn) {
friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
return str == tn.str();
}

friend bool operator!=(const std::string & str, const BASE_TN_IMPL & tn) {
friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
return str != tn.str();
}
};

using LLM_TN_IMPL = BASE_TN_IMPL<llm_arch, llm_tensor>;
struct LLM_TN {
LLM_TN(llm_arch arch) : arch(arch) {}

@@ -446,20 +439,6 @@ struct LLM_TN {
}
};

struct VISION_TN {
VISION_TN(vision_arch arch) : arch(arch) {}

vision_arch arch;

BASE_TN_IMPL<vision_arch, vision_tensor> operator()(vision_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
return { arch, tensor, suffix, bid, xid };
}

BASE_TN_IMPL<vision_arch, vision_tensor> operator()(vision_tensor tensor, int bid = -1, int xid = -1) const {
return { arch, tensor, nullptr, bid, xid };
}
};


struct llm_tensor_info {
llm_tensor_layer layer;
@@ -470,6 +449,4 @@ const char * llm_arch_name(llm_arch arch);

llm_arch llm_arch_from_string(const std::string & name);

vision_arch vision_arch_from_string(const std::string & name);

const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
@@ -1281,8 +1281,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
std::string arch;
ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
vparams.arch = vision_arch_from_string(arch);
if (vparams.arch == VISION_ARCH_UNKNOWN) {
vparams.arch = llm_arch_from_string(arch);
if (vparams.arch == LLM_ARCH_UNKNOWN) {
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
}
}
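Since vision_arch is gone, the architecture string read from the GGUF metadata is now resolved through the generic llm_arch lookup. A minimal sketch of what that resolution amounts to (hypothetical, assuming llm_arch_from_string performs a reverse lookup over the LLM_ARCH_NAMES table shown at the top of this diff):

    // hypothetical sketch — "minicpmv" maps to LLM_ARCH_VISION_MINICPMV in LLM_ARCH_NAMES
    llm_arch arch = llm_arch_from_string("minicpmv");
    if (arch == LLM_ARCH_UNKNOWN) {
        // the loader above rejects the model with "unsupported vision arch: ..."
    }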
@@ -3421,7 +3421,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t max_pos_embd = vparams.max_pos_embd;
const int64_t n_channel = 3; // always RGB
const int64_t patch_size = vparams.patch_size;
const auto tn = VISION_TN(vparams.arch);
const auto tn = LLM_TN(vparams.arch);

// clip is CPU-only for now
clip.buft = ggml_backend_cpu_buffer_type();
@@ -3429,85 +3429,85 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
clip.layers.resize(n_vlayer);

switch (vparams.arch) {
case VISION_ARCH_LLAVA:
case VISION_ARCH_MOBILEVLM:
case LLM_ARCH_VISION_LLAVA:
case LLM_ARCH_VISION_MOBILEVLM:
{
if (vparams.arch == VISION_ARCH_LLAVA) {
clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_vembd, n_vff});
clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_vff});
clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_vff, n_vff});
clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_vff});
} else if (vparams.arch == VISION_ARCH_MOBILEVLM) {
clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd});
clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 0), {n_embd});
clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 2), {n_embd, n_embd});
clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 2), {n_embd});
clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd});
clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "bias", 0), {n_embd});
if (vparams.arch == LLM_ARCH_VISION_LLAVA) {
clip.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff});
clip.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff});
clip.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff});
clip.mm_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff});
} else if (vparams.arch == LLM_ARCH_VISION_MOBILEVLM) {
clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd});
clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd});
clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd});
clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd});
clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd});
clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd});
}

clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_vembd});
clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
clip.class_embedding = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd});
clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});

clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_vembd});
clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_vembd});
clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd});
clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd});
clip.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
clip.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);

for (int i = 0; i < n_vlayer; ++i) {
auto & layer = clip.layers[i];

layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd});
layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd});
layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd});
layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd});
layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd});
layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd});

layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff});
layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd});
layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff});
layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd});

layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd});
layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd});
layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});
layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd});
layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd});
layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});

layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd});
layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd});
}
} break;
case VISION_ARCH_MINICPMV:
case LLM_ARCH_VISION_MINICPMV:
{
clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});

// TODO: load all resampler tensors

for (int i = 0; i < n_vlayer; ++i) {
auto & layer = clip.layers[i];

layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd});
layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd});
layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd});
layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd});
layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd});
layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd});

layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff});
layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd});
layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff});
layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd});

layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd});
layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd});
layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});
layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd});
layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd});
layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});

layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd});
layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd});
}
} break;
default:
@@ -393,7 +393,7 @@ struct minicpmv_preprocessor {

static llama_vision_patches clip_image_preprocess_minicpmv(const clip_context & ctx, const clip_image_u8 & img) {
auto & params = ctx.model->hparams;
GGML_ASSERT(params.arch == VISION_ARCH_MINICPMV);
GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV);

static const int max_slice_nums = 9;
minicpmv_preprocessor preprocessor;
@@ -775,7 +775,7 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches
auto & model = *ctx.model;
auto & hparams = ctx.model->hparams;

if (hparams.arch == VISION_ARCH_LLAVA) {
if (hparams.arch == LLM_ARCH_VISION_LLAVA) {
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
}

@@ -895,7 +895,7 @@ struct llama_vision_patches * llama_vision_patches_init(
struct llama_context * ctx,
llama_vision_bitmap * bmp) {
clip_context & vctx = ctx->vctx;
if (vctx.model->hparams.arch == VISION_ARCH_MINICPMV) {
if (vctx.model->hparams.arch == LLM_ARCH_VISION_MINICPMV) {
return new llama_vision_patches(clip_image_preprocess_minicpmv(vctx, *bmp));
}
return new llama_vision_patches(clip_image_preprocess(vctx, *bmp));
@@ -22,7 +22,7 @@ enum mm_patch_merge {
};

struct clip_hparams {
vision_arch arch = VISION_ARCH_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;

uint32_t image_size;
uint32_t patch_size;
@@ -157,18 +157,6 @@ struct llama_vision_patches {
std::vector<std::vector<float>> buf; // preprocessed image data
};

inline vision_arch vision_arch_from_string(const std::string & name) {
if (name == "llava") {
return VISION_ARCH_LLAVA;
} else if (name == "mobilevlm") {
return VISION_ARCH_MOBILEVLM;
} else if (name == "minicpmv") {
return VISION_ARCH_MINICPMV;
}

return VISION_ARCH_UNKNOWN;
}

inline mm_patch_merge mm_patch_merge_from_name(std::string & name) {
if (name == "flat") {
return MM_PATCH_MERGE_FLAT;