From 8f0350578d14047d33304dd9aecde07fd3f355c7 Mon Sep 17 00:00:00 2001 From: caitianchi Date: Tue, 25 Jun 2024 18:51:06 +0800 Subject: [PATCH] fix quality problem in pr code --- .gitignore | 1 + Makefile | 2 +- examples/llava/clip.cpp | 4 ++-- examples/llava/llava.cpp | 21 ++++++++++++++++++++- examples/llava/llava.h | 1 + 5 files changed, 25 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 6c3a0abf0..409c06593 100644 --- a/.gitignore +++ b/.gitignore @@ -117,6 +117,7 @@ poetry.toml /tests/test-tokenizer-0 /tests/test-tokenizer-1-bpe /tests/test-tokenizer-1-spm +/openbmb # Scripts !/scripts/install-oneapi.bat diff --git a/Makefile b/Makefile index 2ffee6338..1be0154ee 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ BUILD_TARGETS = \ llama-imatrix \ llama-infill \ llama-llava-cli \ - llama-minicpmv-cli\ + llama-minicpmv-cli\ llama-lookahead \ llama-lookup \ llama-lookup-create \ diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9353f5a02..5b6f7aef3 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -647,7 +647,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // loop over layers - for (int il = 0; il < n_layer - 1; il++) { + for (int il = 0; il < n_layer; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states //const size_t nb_q_w = model.layers[il].q_w->nb[0]; @@ -2077,7 +2077,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // build the inference graph - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, load_image_size); ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); // set inputs diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 0d7324037..93a5b0ea4 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -657,7 +657,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * for (size_t j = 0; j < imgs[i].size(); ++j) { float* image_embed = NULL; int n_image_pos = 0; - bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos); + bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos); if (!image_embed_result) { LOG_TEE("%s: coulnd't embed the image\n", __func__); return NULL; @@ -672,6 +672,25 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * return results; } +bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model + if (!image_embd) { + LOG_TEE("Unable to allocate memory for image embeddings\n"); + return false; + } + + int n_img_pos; + if (!encode_image_with_clip_uhd(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { + LOG_TEE("%s: cannot encode image, aborting\n", __func__); + free(image_embd); + return false; + } + *image_embd_out = image_embd; + *n_img_pos_out = n_img_pos; + + return true; +} + bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { auto embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); auto image_embed_slices = embeds->image_embeds; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 420ae15d6..5f29f02c5 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -47,6 +47,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** build an image embed from image file bytes */ LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img); /** build an image embed from a path to an image filename */ +LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);