diff --git a/.gitignore b/.gitignore
index 409c06593..6c3a0abf0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,7 +117,6 @@ poetry.toml
 /tests/test-tokenizer-0
 /tests/test-tokenizer-1-bpe
 /tests/test-tokenizer-1-spm
-/openbmb
 
 # Scripts
 !/scripts/install-oneapi.bat
diff --git a/examples/llava/assets/xiaomi14pro_test.jpeg b/examples/llava/assets/xiaomi14pro_test.jpeg
deleted file mode 100644
index 8762c9c7b..000000000
Binary files a/examples/llava/assets/xiaomi14pro_test.jpeg and /dev/null differ
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 056d0f2ad..33d61e3b1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -554,7 +554,7 @@ struct clip_ctx {
     ggml_gallocr_t compute_alloc = NULL;
 };
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct load_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -567,8 +567,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     int image_size_width  = image_size;
     int image_size_height = image_size;
     if (ctx->has_minicpmv_projector) {
-        image_size_width  = load_image_size.first;
-        image_size_height = load_image_size.second;
+        if (load_image_size == nullptr) {
+            load_image_size = load_image_size_init();
+        }
+        LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
+        image_size_width  = load_image_size->image_size_width;
+        image_size_height = load_image_size->image_size_height;
         if (is_inf){
             image_size_width  = imgs->data->nx;
             image_size_height = imgs->data->ny;
@@ -995,7 +999,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 }
 
 // read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, std::pair<int, int> load_image_size) {
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct load_image_size * load_image_size) {
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
@@ -1464,6 +1468,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
     return new_clip;
 }
 
+struct load_image_size * load_image_size_init() {
+    struct load_image_size * load_image_size = new struct load_image_size();
+    load_image_size->image_size_width  = 448;
+    load_image_size->image_size_height = 448;
+    return load_image_size;
+}
+
 struct clip_image_u8 * clip_image_u8_init() {
     return new clip_image_u8();
 }
@@ -2058,7 +2069,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
     return pos_embed_2d;
 }
 
-bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size) {
+bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct load_image_size * load_image_size) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
@@ -2070,7 +2081,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
 }
 
-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size) {
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
@@ -2148,8 +2159,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
         //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
         struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-        int pos_w = load_image_size.first/patch_size;
-        int pos_h = load_image_size.second/patch_size;
+        if (load_image_size == nullptr) {
+            load_image_size = load_image_size_init();
+        }
+        LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
+        int pos_w = load_image_size->image_size_width/patch_size;
+        int pos_h = load_image_size->image_size_height/patch_size;
         int embed_dim = 4096;
         auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 232fe50a8..da6f5d2b7 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -3,7 +3,6 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include <utility>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -27,6 +26,10 @@ extern "C" {
 
 struct clip_ctx;
 
+struct load_image_size {
+    int image_size_width;
+    int image_size_height;
+};
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -37,7 +40,7 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity, std::pair<int, int> load_image_size = {448, 448});
+CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity, struct load_image_size * load_image_size = nullptr);
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
@@ -56,6 +59,7 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
+CLIP_API struct load_image_size * load_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
@@ -76,8 +80,8 @@ CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_im
 
 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 
-CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size = {448, 448});
-CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size = {448, 448});
+CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct load_image_size * load_image_size = nullptr);
+CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size = nullptr);
 
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index dc9a01a45..1730faa8e 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -31,6 +31,9 @@ struct clip_image_grid_shape {
     int second;
 };
 
+struct uhd_image_embed {
+    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
+};
 /**
  * Selects the best resolution from a list of possible resolutions based on the original size.
  *
@@ -410,7 +413,7 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
     free(embed);
 }
 
-static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, std::pair<int, int> load_image_size) {
+static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct load_image_size * load_image_size) {
     // std::vector<clip_image_f32 *> img_res_v; // format VectN x H x W x RGB (N x 448 x 448 x 3)
     clip_image_f32 * img_res_v = clip_image_f32_init();
@@ -683,9 +686,10 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
             float* image_embed = NULL;
             int n_image_pos = 0;
             int patch_size=14;
-            std::pair<int, int> load_image_size;
-            load_image_size.first  = imgs[i][j]->nx;
-            load_image_size.second = imgs[i][j]->ny;
+            struct load_image_size * load_image_size = load_image_size_init();
+            load_image_size->image_size_width  = imgs[i][j]->nx;
+            load_image_size->image_size_height = imgs[i][j]->ny;
+            LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
             bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size);
             if (!image_embed_result) {
                 LOG_TEE("%s: coulnd't embed the image\n", __func__);
@@ -701,7 +705,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
     return results;
 }
 
-bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size) {
+bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size) {
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
     if (!image_embd) {
         LOG_TEE("Unable to allocate memory for image embeddings\n");
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index abb53d3d6..95fb42429 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -18,15 +18,13 @@
 #endif
 
 struct clip_ctx;
-
-struct uhd_image_embed {
-    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
-};
+struct uhd_image_embed;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+struct uhd_image_embed;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
@@ -47,7 +45,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** build an image embed from image file bytes */
 LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
 /** build an image embed from a path to an image filename */
-LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size = {448, 448});
+LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size = nullptr);
 LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
 LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
 LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index befaec8bc..b68947116 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -10,6 +10,10 @@
 #include <cstdlib>
 #include <vector>
 
+struct uhd_image_embed {
+    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
+};
+
 static void show_additional_info(int /*argc*/, char ** argv) {
     LOG_TEE("\n example usage: %s -m <model> --mmproj <mmproj> --image <image> --image <image> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
diff --git a/examples/llava/minicpmv_wrapper.cpp b/examples/llava/minicpmv_wrapper.cpp
index 5e1d9b134..3d72a4599 100644
--- a/examples/llava/minicpmv_wrapper.cpp
+++ b/examples/llava/minicpmv_wrapper.cpp
@@ -58,7 +58,7 @@ struct clip_ctx * clip_init_context(gpt_params * params) {
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
-    std::pair<int, int> load_image_size = std::make_pair(448, 448);
+    struct load_image_size * load_image_size = load_image_size_init();
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size);
     return ctx_clip;
 }
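
For review context, a minimal caller-side sketch of the reworked, C-compatible API introduced by this patch (not part of the diff). The model path, dimensions, and thread count below are hypothetical placeholders; passing nullptr for the size pointer falls back to the 448x448 default that load_image_size_init() installs inside clip.cpp.

```cpp
// Sketch only: illustrates the new pointer-based load_image_size API.
// "mmproj-model-f16.gguf" and the 672x448 dimensions are placeholders.
#include "clip.h"

#include <cstdio>

int main() {
    // Allocate the size struct; load_image_size_init() defaults it to 448x448.
    struct load_image_size * size = load_image_size_init();
    size->image_size_width  = 672;
    size->image_size_height = 448;

    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/ 1, size);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load clip model\n");
        delete size;
        return 1;
    }

    // ... encode images via clip_image_encode(ctx, n_threads, img, vec, size) ...

    clip_free(ctx);
    delete size; // the patch adds no matching free function, so a C++ caller cleans up
    return 0;
}
```

Note that load_image_size_init() allocates with new and the header exposes no corresponding free function, so callers built as C cannot release the struct; the nullptr fallback paths inside clip_image_build_graph and clip_image_batch_encode likewise leak one small allocation per call.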