diff --git a/Makefile b/Makefile index 1be0154ee..f9cd986c7 100644 --- a/Makefile +++ b/Makefile @@ -950,12 +950,12 @@ llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/lla $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) -llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv_wrapper.h examples/llava/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv-wrapper.h examples/llava/minicpmv-wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) -c examples/llava/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/llava/minicpmv-wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv-wrapper.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv-wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv-wrapper.cpp) -o $@ $(LDFLAGS) llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index c84fc643a..2978225c5 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -44,7 +44,7 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -add_library(minicpmv_wrapper OBJECT - minicpmv_wrapper.cpp +add_library(minicpmv-wrapper OBJECT + minicpmv-wrapper.cpp ) -target_link_libraries(minicpmv_wrapper PRIVATE llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(minicpmv-wrapper PRIVATE llava ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/llava/README_minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md similarity index 100% rename from examples/llava/README_minicpmv2.5.md rename to examples/llava/README-minicpmv2.5.md diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 33d61e3b1..a74cdc147 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -77,7 +77,7 @@ static std::string format(const char * fmt, ...) { #define KEY_HAS_TEXT_ENC "clip.has_text_encoder" #define KEY_HAS_VIS_ENC "clip.has_vision_encoder" #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_HAS_MiniCPMV_PROJ "clip.has_minicpmv_projector" +#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" #define KEY_USE_GELU "clip.use_gelu" #define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_FF "clip.%s.feed_forward_length" @@ -124,8 +124,7 @@ static std::string format(const char * fmt, ...) { #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" -// MINICPMV -// #define TN_MINICPMV_POS_EMBD "resampler.pos_embed" + #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" #define TN_MINICPMV_QUERY "resampler.query" #define TN_MINICPMV_PROJ "resampler.proj.weight" @@ -502,7 +501,6 @@ struct clip_vision_model { struct ggml_tensor * mm_model_peg_0_b; // MINICPMV projection - // struct ggml_tensor * mm_model_pos_embed; struct ggml_tensor * mm_model_pos_embed_k; struct ggml_tensor * mm_model_query; struct ggml_tensor * mm_model_proj; @@ -554,7 +552,7 @@ struct clip_ctx { ggml_gallocr_t compute_alloc = NULL; }; -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct load_image_size * load_image_size, bool is_inf = false) { +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); return nullptr; @@ -568,11 +566,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 int image_size_height = image_size; if (ctx->has_minicpmv_projector) { if(load_image_size==nullptr){ - load_image_size= load_image_size_init(); + load_image_size= clip_image_size_init(); } - LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height); - image_size_width = load_image_size->image_size_width; - image_size_height = load_image_size->image_size_height; + LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height); + image_size_width = load_image_size->width; + image_size_height = load_image_size->height; if (is_inf){ image_size_width = imgs->data->nx; image_size_height = imgs->data->ny; @@ -610,7 +608,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - + if (ctx->has_patch_bias) { // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); inp = ggml_add(ctx0, inp, model.patch_bias); @@ -926,7 +924,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); embeddings = peg_0; } - else { GGML_ASSERT(false); } @@ -999,7 +996,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // read and create ggml_context containing the tensors and their data -struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct load_image_size * load_image_size) { +struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct clip_image_size * load_image_size) { struct ggml_context * meta = NULL; struct gguf_init_params params = { @@ -1468,10 +1465,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s return new_clip; } -struct load_image_size * load_image_size_init() { - struct load_image_size * load_image_size = new struct load_image_size(); - load_image_size->image_size_width = 448; - load_image_size->image_size_height = 448; +struct clip_image_size * clip_image_size_init() { + struct clip_image_size * load_image_size = new struct clip_image_size(); + load_image_size->width = 448; + load_image_size->height = 448; return load_image_size; } @@ -2069,7 +2066,7 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co return pos_embed_2d; } -bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct load_image_size * load_image_size) { +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct clip_image_size * load_image_size) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; @@ -2081,7 +2078,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size); } -bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size) { +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct clip_image_size * load_image_size) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; @@ -2103,7 +2100,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; + const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; if (ctx->has_minicpmv_projector) { @@ -2160,11 +2157,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); if(load_image_size==nullptr){ - load_image_size= load_image_size_init(); + load_image_size= clip_image_size_init(); } - LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height); - int pos_w = load_image_size->image_size_width/patch_size; - int pos_h = load_image_size->image_size_height/patch_size; + LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height); + int pos_w = load_image_size->width/patch_size; + int pos_h = load_image_size->height/patch_size; int embed_dim = 4096; auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index da6f5d2b7..f44e38ee1 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -26,9 +26,9 @@ extern "C" { struct clip_ctx; -struct load_image_size { - int image_size_width; - int image_size_height; +struct clip_image_size { + int width; + int height; }; struct clip_image_u8_batch { struct clip_image_u8 * data; @@ -40,7 +40,7 @@ struct clip_image_f32_batch { size_t size; }; -CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, struct load_image_size * load_image_size = nullptr); +CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, struct clip_image_size * load_image_size); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -59,7 +59,7 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); -CLIP_API struct load_image_size * load_image_size_init(); +CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -80,8 +80,8 @@ CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_im CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); -CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct load_image_size * load_image_size = nullptr); -CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size = nullptr); +CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct clip_image_size * load_image_size); +CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct clip_image_size * load_image_size); CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 1730faa8e..98e7b5c31 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -413,7 +413,7 @@ void llava_image_embed_free(struct llava_image_embed * embed) { free(embed); } -static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct load_image_size * load_image_size) { +static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct clip_image_size * load_image_size) { // std::vector img_res_v; // format VectN x H x W x RGB (N x 448 x 448 x 3) clip_image_f32 * img_res_v = clip_image_f32_init(); @@ -686,10 +686,10 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * float* image_embed = NULL; int n_image_pos = 0; int patch_size=14; - struct load_image_size * load_image_size = load_image_size_init(); - load_image_size->image_size_width = imgs[i][j]->nx; - load_image_size->image_size_height = imgs[i][j]->ny; - LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height); + struct clip_image_size * load_image_size = clip_image_size_init(); + load_image_size->width = imgs[i][j]->nx; + load_image_size->height = imgs[i][j]->ny; + LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height); bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size); if (!image_embed_result) { LOG_TEE("%s: coulnd't embed the image\n", __func__); @@ -705,7 +705,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * return results; } -bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size) { +bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct clip_image_size * load_image_size) { float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { LOG_TEE("Unable to allocate memory for image embeddings\n"); @@ -724,50 +724,6 @@ bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads return true; } -bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - auto embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); - auto image_embed_slices = embeds->image_embeds; - if (!image_embed_slices[0][0]){ - LOG_TEE("%s: failed to embeding image\n", __func__); - return false; - } - std::string fname = "./examples/minicpm-v2.5/slice_token_for_ollama.raw"; - unsigned char* slice_token; - long image_bytes_length; - auto loaded = load_file_to_bytes(fname.c_str(), &slice_token, &image_bytes_length); - if (!loaded) { - LOG_TEE("%s: failed to load %s\n", __func__, fname.c_str()); - return false; - } - - float * all_image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*61); - int all_n_img_pos=0; - int token_len = clip_n_mmproj_embd(ctx_clip)*sizeof(float); - - std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len); - std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[0][0]->embed, 96*token_len); - all_n_img_pos+=clip_n_patches(ctx_clip); - std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len); - if (image_embed_slices.size() > 1) { - std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*2, token_len); - for (size_t i = 1; i < image_embed_slices.size(); ++i) { - for (size_t j = 0; j < image_embed_slices[i].size(); ++j) { - std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len); - std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[i][j]->embed, 96*token_len); - all_n_img_pos+=clip_n_patches(ctx_clip); - std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len); - if (j == image_embed_slices[i].size() - 1) { - std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*4, token_len); - } - } - } - std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*3, token_len); - } - *image_embd_out = all_image_embd; - *n_img_pos_out = all_n_img_pos; - return true; -} - struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { unsigned char* image_bytes; long image_bytes_length; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 95fb42429..4e124a142 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -45,8 +45,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** build an image embed from image file bytes */ LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img); /** build an image embed from a path to an image filename */ -LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size = nullptr); -LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); +LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct clip_image_size * load_image_size); LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index b68947116..da2ffdbeb 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -3,7 +3,7 @@ #include "common.h" #include "clip.h" #include "llava.h" -#include "minicpmv_wrapper.h" +#include "minicpmv-wrapper.h" #include "llama.h" #include diff --git a/examples/llava/minicpmv_wrapper.cpp b/examples/llava/minicpmv-wrapper.cpp similarity index 97% rename from examples/llava/minicpmv_wrapper.cpp rename to examples/llava/minicpmv-wrapper.cpp index 3d72a4599..4e7dc24a4 100644 --- a/examples/llava/minicpmv_wrapper.cpp +++ b/examples/llava/minicpmv-wrapper.cpp @@ -2,7 +2,7 @@ #include "common.h" #include "clip.h" #include "llava.h" -#include "minicpmv_wrapper.h" +#include "minicpmv-wrapper.h" #include "llama.h" #include #include @@ -58,7 +58,7 @@ struct clip_ctx * clip_init_context(gpt_params * params) { if (prompt.empty()) { prompt = "describe the image in detail."; } - struct load_image_size * load_image_size = load_image_size_init(); + struct clip_image_size * load_image_size = clip_image_size_init(); auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size); return ctx_clip; } @@ -99,8 +99,7 @@ bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); - eval_tokens(ctx_llama, embd_inp, n_batch, n_past); - return true; + return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); } void process_image(struct minicpmv_context * ctx_llava, std::vector> image_embed_slices, gpt_params * params, int &n_past) { diff --git a/examples/llava/minicpmv_wrapper.h b/examples/llava/minicpmv-wrapper.h similarity index 100% rename from examples/llava/minicpmv_wrapper.h rename to examples/llava/minicpmv-wrapper.h