diff --git a/examples/minicpmv/clip.cpp b/examples/minicpmv/clip.cpp index a0523fd94..b10e9b0c0 100644 --- a/examples/minicpmv/clip.cpp +++ b/examples/minicpmv/clip.cpp @@ -577,7 +577,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 /*.no_alloc =*/ true, }; - LOG_TEE("%s: ctx->buf_compute_meta.size(): %d \n", __func__, ctx->buf_compute_meta.size()); struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph(ctx0); @@ -1446,7 +1445,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } -static void normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst) { +void normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst) { dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); @@ -1511,7 +1510,7 @@ int clip_n_patches(const struct clip_ctx * ctx) { return n_patches; } -std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector>& pos) { +static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector>& pos) { assert(embed_dim % 2 == 0); int H = pos.size(); int W = pos[0].size(); @@ -1535,7 +1534,7 @@ std::vector>> get_1d_sincos_pos_embed_from_grid_n return emb; } -std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>>& grid) { +static std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>>& grid) { assert(embed_dim % 2 == 0); std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) @@ -1555,7 +1554,7 @@ std::vector>> get_2d_sincos_pos_embed_from_grid(i return emb; } -std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { +static std::vector> get_2d_sincos_pos_embed(int embed_dim, const 
std::pair image_size) { int grid_h_size = image_size.first; int grid_w_size = image_size.second; diff --git a/examples/minicpmv/clip.h b/examples/minicpmv/clip.h index aae4c7c3a..3b5aca231 100644 --- a/examples/minicpmv/clip.h +++ b/examples/minicpmv/clip.h @@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); -static void normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst); +CLIP_API void normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst); CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); diff --git a/examples/minicpmv/minicpmv-cli.cpp b/examples/minicpmv/minicpmv-cli.cpp index 281b8d097..5ba515bb3 100644 --- a/examples/minicpmv/minicpmv-cli.cpp +++ b/examples/minicpmv/minicpmv-cli.cpp @@ -21,8 +21,9 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } -struct minicpmv_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ - auto image_embed_slices = minicpmv_image_embed(params, fname); +static struct minicpmv_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ + auto embeds = minicpmv_image_embed(params, fname); + auto image_embed_slices = embeds->image_embeds; if (!image_embed_slices[0][0]) { std::cerr << "error: failed to load image " << fname << ". 
Terminating\n\n"; return NULL; @@ -52,14 +53,13 @@ struct minicpmv_context * minicpmv_init(gpt_params * params, const std::string & float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); - llava_image_embed_free_slice(image_embed_slices); + llava_image_embed_free_uhd(embeds); return ctx_llava; } -struct llama_sampling_context * llama_init(struct minicpmv_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ +static struct llama_sampling_context * llama_init(struct minicpmv_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ std::string user_prompt = prompt; if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; - const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); @@ -71,7 +71,7 @@ struct llama_sampling_context * llama_init(struct minicpmv_context * ctx_llava, return ctx_sampling; } -const char * llama_loop(struct minicpmv_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ +static const char * llama_loop(struct minicpmv_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); return tmp; diff --git a/examples/minicpmv/minicpmv.cpp b/examples/minicpmv/minicpmv.cpp index 9dbe577c9..611d4a7f0 100644 --- a/examples/minicpmv/minicpmv.cpp +++ b/examples/minicpmv/minicpmv.cpp @@ -108,11 +108,11 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ return true; } -int ensure_divide(int length, int patch_size) { +static int 
ensure_divide(int length, int patch_size) { return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); } -std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { +static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { int width = original_size.first; int height = original_size.second; if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { @@ -129,7 +129,7 @@ inline float clip(float x, float lower, float upper) { return std::max(lower, std::min(x, upper)); } -std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { +static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { int width, height; std::tie(width, height) = original_size; int grid_x, grid_y; @@ -218,7 +218,7 @@ static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int tar // -> https://arxiv.org/pdf/2403.11703 // -> https://github.com/thunlp/LLaVA-UHD // -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 -std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14, const bool never_split=false) { +static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { const std::pair original_size={img->nx,img->ny}; const int original_width = img->nx; const int original_height = img->ny; @@ -311,30 +311,30 @@ std::vector> uhd_slice_image(const clip_image_u8 * return images; } -std::vector> llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const 
clip_image_u8 * img) { +struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img) { std::vector> imgs = uhd_slice_image(img); for (size_t i = 0; i < imgs.size(); ++i){ for (size_t j = 0; j < imgs[i].size(); ++j) { LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); } } - std::vector> results; + struct uhd_image_embed * results = new uhd_image_embed(); for (size_t i = 0; i < imgs.size(); ++i){ - results.push_back(std::vector()); + results->image_embeds.push_back(std::vector()); for (size_t j = 0; j < imgs[i].size(); ++j) { float* image_embed = NULL; int n_image_pos = 0; bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos); if (!image_embed_result) { LOG_TEE("%s: coulnd't embed the image\n", __func__); - return std::vector>(); + return NULL; } auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed)); result->embed = image_embed; result->n_image_pos = n_image_pos; - results[i].push_back(result); + results->image_embeds[i].push_back(result); } } return results; @@ -374,7 +374,8 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long } bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - auto image_embed_slices = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); + auto embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); + auto image_embed_slices = embeds->image_embeds; if (!image_embed_slices[0][0]){ LOG_TEE("%s: failed to embeding image\n", __func__); return false; @@ -416,35 +417,35 @@ bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_thre return true; } -std::vector> llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { +struct uhd_image_embed * 
llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { unsigned char* image_bytes; long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); if (!loaded) { LOG_TEE("%s: failed to load %s\n", __func__, image_path); - return std::vector>(); + return NULL; } clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); - return std::vector>(); + return NULL; } - std::vector> embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); + struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); clip_image_u8_free(img); free(image_bytes); return embeds; } -void llava_image_embed_free_uhd(std::vector> embed) { - for (size_t i = 0; i < embed.size(); ++i){ - for (size_t j = 0; j < embed[i].size(); ++j){ - free(embed[i][j]->embed); - free(embed[i][j]); +void llava_image_embed_free_uhd(struct uhd_image_embed * embed) { + for (size_t i = 0; i < embed->image_embeds.size(); ++i){ + for (size_t j = 0; j < embed->image_embeds[i].size(); ++j){ + free(embed->image_embeds[i][j]->embed); + free(embed->image_embeds[i][j]); /* release the llava_image_embed element itself; its ->embed buffer was freed on the previous line */ } - embed[i] = std::vector(); + embed->image_embeds[i] = std::vector(); } - embed = std::vector>(); + embed->image_embeds = std::vector>(); } \ No newline at end of file diff --git a/examples/minicpmv/minicpmv.h b/examples/minicpmv/minicpmv.h index dd360920a..337549338 100644 --- a/examples/minicpmv/minicpmv.h +++ b/examples/minicpmv/minicpmv.h @@ -18,6 +18,9 @@ #endif struct clip_ctx; +struct uhd_image_embed { + std::vector> image_embeds; +}; #ifdef __cplusplus extern "C" { @@ -34,11 +37,11 @@ MINICPMV_API bool llava_validate_embed_size(const struct llama_context * ctx_lla MINICPMV_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * 
ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); /** build an image embed from image file bytes */ -MINICPMV_API std::vector> llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); +MINICPMV_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img); /** build an image embed from a path to an image filename */ MINICPMV_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); -MINICPMV_API std::vector> llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -MINICPMV_API void llava_image_embed_free_uhd(std::vector> embed); +MINICPMV_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); +MINICPMV_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed); /** free an embedding made with llava_image_embed_make_* */ /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ diff --git a/examples/minicpmv/minicpmv_wrapper.cpp b/examples/minicpmv/minicpmv_wrapper.cpp index 5e3a572a8..0e8c5eb83 100644 --- a/examples/minicpmv/minicpmv_wrapper.cpp +++ b/examples/minicpmv/minicpmv_wrapper.cpp @@ -23,8 +23,6 @@ struct llama_model * llava_init(gpt_params * params) { } struct minicpmv_context * llava_init_context(gpt_params * params, llama_model * model) { - const char * clip_path = params->mmproj.c_str(); - auto prompt = params->prompt; if (prompt.empty()) { prompt = "describe the image in detail."; @@ -65,9 +63,9 @@ struct clip_ctx * clip_init_context(gpt_params * params) { return ctx_clip; } -std::vector> minicpmv_image_embed(gpt_params * params, const std::string & fname){ +struct uhd_image_embed * minicpmv_image_embed(gpt_params * params, const std::string & fname){ auto ctx_clip = clip_init_context(params); - auto image_embed_and_slices = llava_image_embed_make_with_filename_slice(ctx_clip, params->n_threads, fname.c_str()); + auto image_embed_and_slices = llava_image_embed_make_with_filename_uhd(ctx_clip, params->n_threads, fname.c_str()); if (ctx_clip) { clip_free(ctx_clip); ctx_clip = NULL; diff --git a/examples/minicpmv/minicpmv_wrapper.h b/examples/minicpmv/minicpmv_wrapper.h index b3631ed16..f6c7b3f14 100644 --- a/examples/minicpmv/minicpmv_wrapper.h +++ b/examples/minicpmv/minicpmv_wrapper.h @@ -34,7 +34,7 @@ MINICPMV_API struct minicpmv_context * llava_init_context(gpt_params * params, l MINICPMV_API void llava_free(struct minicpmv_context * ctx_llava); MINICPMV_API struct clip_ctx * clip_init_context(gpt_params * params); -MINICPMV_API std::vector> minicpmv_image_embed(gpt_params * params, const std::string & fname); +MINICPMV_API struct uhd_image_embed * minicpmv_image_embed(gpt_params * params, const std::string & fname); MINICPMV_API bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past); MINICPMV_API bool eval_id(struct llama_context * ctx_llama, int id, int * n_past);