diff --git a/.gitignore b/.gitignore
index 409c06593..6c3a0abf0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,7 +117,6 @@ poetry.toml
 /tests/test-tokenizer-0
 /tests/test-tokenizer-1-bpe
 /tests/test-tokenizer-1-spm
-/openbmb
 
 # Scripts
 !/scripts/install-oneapi.bat
diff --git a/examples/llava/assets/xiaomi14pro_test.jpeg b/examples/llava/assets/xiaomi14pro_test.jpeg
deleted file mode 100644
index 8762c9c7b..000000000
Binary files a/examples/llava/assets/xiaomi14pro_test.jpeg and /dev/null differ
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 056d0f2ad..33d61e3b1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -554,7 +554,7 @@ struct clip_ctx {
     ggml_gallocr_t compute_alloc = NULL;
 };
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct load_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -567,8 +567,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     int image_size_width  = image_size;
     int image_size_height = image_size;
     if (ctx->has_minicpmv_projector) {
-        image_size_width  = load_image_size.first;
-        image_size_height = load_image_size.second;
+        if (load_image_size == nullptr) {
+            load_image_size = load_image_size_init();
+        }
+        LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
+        image_size_width  = load_image_size->image_size_width;
+        image_size_height = load_image_size->image_size_height;
         if (is_inf){
             image_size_width  = imgs->data->nx;
             image_size_height = imgs->data->ny;
@@ -995,7 +999,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 }
 
 // read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, std::pair<int, int> load_image_size) {
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct load_image_size * load_image_size) {
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
@@ -1464,6 +1468,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
     return new_clip;
 }
 
+struct load_image_size * load_image_size_init() {
+    struct load_image_size * load_image_size = new struct load_image_size();
+    load_image_size->image_size_width  = 448;
+    load_image_size->image_size_height = 448;
+    return load_image_size;
+}
+
 struct clip_image_u8 * clip_image_u8_init() {
     return new clip_image_u8();
 }
@@ -2058,7 +2069,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
     return pos_embed_2d;
 }
 
-bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size) {
+bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct load_image_size * load_image_size) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
@@ -2070,7 +2081,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
 }
 
-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size) {
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
@@ -2148,8 +2159,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
         //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
         struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-        int pos_w = load_image_size.first/patch_size;
-        int pos_h = load_image_size.second/patch_size;
+        if (load_image_size == nullptr) {
+            load_image_size = load_image_size_init();
+        }
+        LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
+        int pos_w = load_image_size->image_size_width/patch_size;
+        int pos_h = load_image_size->image_size_height/patch_size;
         int embed_dim = 4096;
         auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 232fe50a8..da6f5d2b7 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -3,7 +3,6 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include <utility>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -27,6 +26,10 @@ extern "C" {
 
 struct clip_ctx;
 
+struct load_image_size {
+    int image_size_width;
+    int image_size_height;
+};
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -37,7 +40,7 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity, std::pair<int, int> load_image_size = {448, 448});
+CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity, struct load_image_size * load_image_size = nullptr);
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
@@ -56,6 +59,7 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
+CLIP_API struct load_image_size * load_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
@@ -76,8 +80,8 @@ CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_im
 
 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 
-CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size = {448, 448});
-CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size = {448, 448});
+CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct load_image_size * load_image_size = nullptr);
+CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size = nullptr);
 
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index dc9a01a45..1730faa8e 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -31,6 +31,9 @@ struct clip_image_grid_shape {
     int second;
 };
 
+struct uhd_image_embed {
+    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
+};
 /**
  * Selects the best resolution from a list of possible resolutions based on the original size.
  *
@@ -410,7 +413,7 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
     free(embed);
 }
 
-static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, std::pair<int, int> load_image_size) {
+static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct load_image_size * load_image_size) {
     // std::vector<clip_image_f32 *> img_res_v; // format VectN x H x W x RGB (N x 448 x 448 x 3)
     clip_image_f32 * img_res_v = clip_image_f32_init();
@@ -683,9 +686,10 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
             float* image_embed = NULL;
             int n_image_pos = 0;
             int patch_size=14;
-            std::pair<int, int> load_image_size;
-            load_image_size.first  = imgs[i][j]->nx;
-            load_image_size.second = imgs[i][j]->ny;
+            struct load_image_size * load_image_size = load_image_size_init();
+            load_image_size->image_size_width  = imgs[i][j]->nx;
+            load_image_size->image_size_height = imgs[i][j]->ny;
+            LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
             bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size);
             if (!image_embed_result) {
                 LOG_TEE("%s: coulnd't embed the image\n", __func__);
@@ -701,7 +705,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
     return results;
 }
 
-bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size) {
+bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size) {
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
     if (!image_embd) {
         LOG_TEE("Unable to allocate memory for image embeddings\n");
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index abb53d3d6..95fb42429 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -18,15 +18,13 @@
 #endif
 
 struct clip_ctx;
-
-struct uhd_image_embed {
-    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
-};
+struct uhd_image_embed;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+struct uhd_image_embed;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
@@ -47,7 +45,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** build an image embed from image file bytes */
 LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
 /** build an image embed from a path to an image filename */
-LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size = {448, 448});
+LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size = nullptr);
 LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
 LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
 LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index befaec8bc..b68947116 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -10,6 +10,10 @@
 #include <cstdlib>
 #include <vector>
 
+struct uhd_image_embed {
+    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
+};
+
 static void show_additional_info(int /*argc*/, char ** argv) {
     LOG_TEE("\n example usage: %s -m <model> --mmproj <mmproj> --image <image> --image <image> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
diff --git a/examples/llava/minicpmv_wrapper.cpp b/examples/llava/minicpmv_wrapper.cpp
index 5e1d9b134..3d72a4599 100644
--- a/examples/llava/minicpmv_wrapper.cpp
+++ b/examples/llava/minicpmv_wrapper.cpp
@@ -58,7 +58,7 @@ struct clip_ctx * clip_init_context(gpt_params * params) {
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
-    std::pair<int, int> load_image_size = std::make_pair(448, 448);
+    struct load_image_size * load_image_size = load_image_size_init();
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size);
     return ctx_clip;
 }
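
For review context, a minimal caller-side sketch of the reworked, C-compatible API introduced by this patch (not part of the diff). The model path, dimensions, and thread count below are hypothetical placeholders; passing nullptr for the size pointer falls back to the 448x448 default that load_image_size_init() installs inside clip.cpp.

```cpp
// Sketch only: illustrates the new pointer-based load_image_size API.
// "mmproj-model-f16.gguf" and the 672x448 dimensions are placeholders.
#include "clip.h"

#include <cstdio>

int main() {
    // Allocate the size struct; load_image_size_init() defaults it to 448x448.
    struct load_image_size * size = load_image_size_init();
    size->image_size_width  = 672;
    size->image_size_height = 448;

    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/ 1, size);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load clip model\n");
        delete size;
        return 1;
    }

    // ... encode images via clip_image_encode(ctx, n_threads, img, vec, size) ...

    clip_free(ctx);
    delete size; // the patch adds no matching free function, so a C++ caller cleans up
    return 0;
}
```

Note that load_image_size_init() allocates with new and the header exposes no corresponding free function, so callers built as C cannot release the struct; the nullptr fallback paths inside clip_image_build_graph and clip_image_batch_encode likewise leak one small allocation per call.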