mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
move load_image_size into clip_ctx
This commit is contained in:
parent
3642be9937
commit
fcde997126
@ -550,6 +550,8 @@ struct clip_ctx {
|
|||||||
|
|
||||||
ggml_backend_t backend = NULL;
|
ggml_backend_t backend = NULL;
|
||||||
ggml_gallocr_t compute_alloc = NULL;
|
ggml_gallocr_t compute_alloc = NULL;
|
||||||
|
|
||||||
|
struct clip_image_size * load_image_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||||
@ -996,7 +998,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
}
|
}
|
||||||
|
|
||||||
// read and create ggml_context containing the tensors and their data
|
// read and create ggml_context containing the tensors and their data
|
||||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct clip_image_size * load_image_size) {
|
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
struct ggml_context * meta = NULL;
|
struct ggml_context * meta = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
@ -1456,7 +1458,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
|
|||||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||||
clip_image_f32_batch batch;
|
clip_image_f32_batch batch;
|
||||||
batch.size = 1;
|
batch.size = 1;
|
||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, load_image_size, false);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||||
@ -1465,6 +1467,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
|
|||||||
return new_clip;
|
return new_clip;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size){
|
||||||
|
ctx_clip->load_image_size = load_image_size;
|
||||||
|
}
|
||||||
|
|
||||||
struct clip_image_size * clip_image_size_init() {
|
struct clip_image_size * clip_image_size_init() {
|
||||||
struct clip_image_size * load_image_size = new struct clip_image_size();
|
struct clip_image_size * load_image_size = new struct clip_image_size();
|
||||||
load_image_size->width = 448;
|
load_image_size->width = 448;
|
||||||
@ -2066,7 +2072,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
|
|||||||
return pos_embed_2d;
|
return pos_embed_2d;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct clip_image_size * load_image_size) {
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
@ -2075,10 +2081,10 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
|||||||
clip_image_f32_batch imgs{};
|
clip_image_f32_batch imgs{};
|
||||||
imgs.size = 1;
|
imgs.size = 1;
|
||||||
imgs.data = img;
|
imgs.data = img;
|
||||||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
|
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct clip_image_size * load_image_size) {
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
@ -2093,7 +2099,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
}
|
}
|
||||||
|
|
||||||
// build the inference graph
|
// build the inference graph
|
||||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, load_image_size, true);
|
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
||||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||||
|
|
||||||
// set inputs
|
// set inputs
|
||||||
@ -2156,12 +2162,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
||||||
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
||||||
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
||||||
if(load_image_size==nullptr){
|
if(ctx->load_image_size==nullptr){
|
||||||
load_image_size= clip_image_size_init();
|
ctx->load_image_size= clip_image_size_init();
|
||||||
}
|
}
|
||||||
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_TEE("%s : %d %d\n", __func__, ctx->load_image_size->width, ctx->load_image_size->height);
|
||||||
int pos_w = load_image_size->width/patch_size;
|
int pos_w = ctx->load_image_size->width/patch_size;
|
||||||
int pos_h = load_image_size->height/patch_size;
|
int pos_h = ctx->load_image_size->height/patch_size;
|
||||||
int embed_dim = 4096;
|
int embed_dim = 4096;
|
||||||
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||||
|
|
||||||
|
@ -40,7 +40,7 @@ struct clip_image_f32_batch {
|
|||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, struct clip_image_size * load_image_size);
|
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
|
||||||
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
||||||
|
|
||||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||||
@ -59,6 +59,7 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
|||||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
|
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
||||||
CLIP_API struct clip_image_size * clip_image_size_init();
|
CLIP_API struct clip_image_size * clip_image_size_init();
|
||||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||||
@ -80,8 +81,8 @@ CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_im
|
|||||||
|
|
||||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct clip_image_size * load_image_size);
|
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||||
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct clip_image_size * load_image_size);
|
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||||
|
|
||||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||||
|
|
||||||
|
@ -413,7 +413,7 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
|
|||||||
free(embed);
|
free(embed);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct clip_image_size * load_image_size) {
|
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
||||||
// std::vector<clip_image_f32*> img_res_v;
|
// std::vector<clip_image_f32*> img_res_v;
|
||||||
// format VectN x H x W x RGB (N x 448 x 448 x 3)
|
// format VectN x H x W x RGB (N x 448 x 448 x 3)
|
||||||
clip_image_f32 * img_res_v = clip_image_f32_init();
|
clip_image_f32 * img_res_v = clip_image_f32_init();
|
||||||
@ -425,7 +425,7 @@ static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const
|
|||||||
LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);
|
LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);
|
||||||
|
|
||||||
*n_img_pos = clip_n_patches(ctx_clip);
|
*n_img_pos = clip_n_patches(ctx_clip);
|
||||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd, load_image_size); // image_embd shape is 96 x 4096
|
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd); // image_embd shape is 96 x 4096
|
||||||
if (!encoded) {
|
if (!encoded) {
|
||||||
LOG_TEE("Unable to encode image\n");
|
LOG_TEE("Unable to encode image\n");
|
||||||
return false;
|
return false;
|
||||||
@ -690,7 +690,8 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
load_image_size->width = imgs[i][j]->nx;
|
load_image_size->width = imgs[i][j]->nx;
|
||||||
load_image_size->height = imgs[i][j]->ny;
|
load_image_size->height = imgs[i][j]->ny;
|
||||||
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size);
|
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||||
|
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos);
|
||||||
if (!image_embed_result) {
|
if (!image_embed_result) {
|
||||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -705,7 +706,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct clip_image_size * load_image_size) {
|
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
||||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
||||||
if (!image_embd) {
|
if (!image_embd) {
|
||||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||||
@ -713,7 +714,7 @@ bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads
|
|||||||
}
|
}
|
||||||
|
|
||||||
int n_img_pos;
|
int n_img_pos;
|
||||||
if (!encode_image_with_clip_uhd(ctx_clip, n_threads, img, image_embd, &n_img_pos, load_image_size)) {
|
if (!encode_image_with_clip_uhd(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
|
||||||
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
|
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
|
||||||
free(image_embd);
|
free(image_embd);
|
||||||
return false;
|
return false;
|
||||||
|
@ -45,7 +45,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
|||||||
/** build an image embed from image file bytes */
|
/** build an image embed from image file bytes */
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
||||||
/** build an image embed from a path to an image filename */
|
/** build an image embed from a path to an image filename */
|
||||||
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct clip_image_size * load_image_size);
|
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||||
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user