mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 12:21:40 +01:00
imitate reshape bug of python code
This commit is contained in:
parent
4c67d7cef5
commit
977941d9fe
@ -554,7 +554,7 @@ struct clip_ctx {
|
|||||||
ggml_gallocr_t compute_alloc = NULL;
|
ggml_gallocr_t compute_alloc = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}, bool is_inf = false) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -569,6 +569,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
if (ctx->has_minicpmv_projector) {
|
if (ctx->has_minicpmv_projector) {
|
||||||
image_size_width = load_image_size.first;
|
image_size_width = load_image_size.first;
|
||||||
image_size_height = load_image_size.second;
|
image_size_height = load_image_size.second;
|
||||||
|
if (is_inf){
|
||||||
|
image_size_width = imgs->data->nx;
|
||||||
|
image_size_height = imgs->data->ny;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
const int patch_size = hparams.patch_size;
|
const int patch_size = hparams.patch_size;
|
||||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||||
@ -762,7 +766,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||||
|
|
||||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
}
|
||||||
|
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||||
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
||||||
@ -1450,7 +1455,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
|
|||||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||||
clip_image_f32_batch batch;
|
clip_image_f32_batch batch;
|
||||||
batch.size = 1;
|
batch.size = 1;
|
||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, load_image_size);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, load_image_size, false);
|
||||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||||
@ -2080,7 +2085,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
}
|
}
|
||||||
|
|
||||||
// build the inference graph
|
// build the inference graph
|
||||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, load_image_size);
|
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, load_image_size, true);
|
||||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||||
|
|
||||||
// set inputs
|
// set inputs
|
||||||
@ -2091,8 +2096,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
int image_size_width = image_size;
|
int image_size_width = image_size;
|
||||||
int image_size_height = image_size;
|
int image_size_height = image_size;
|
||||||
if (ctx->has_minicpmv_projector) {
|
if (ctx->has_minicpmv_projector) {
|
||||||
image_size_width = load_image_size.first;
|
image_size_width = imgs->data[0].nx;;
|
||||||
image_size_height = load_image_size.second;
|
image_size_height = imgs->data[0].ny;
|
||||||
}
|
}
|
||||||
const int patch_size = hparams.patch_size;
|
const int patch_size = hparams.patch_size;
|
||||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||||
@ -2144,8 +2149,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
||||||
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
||||||
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
||||||
int pos_w = image_size_width/patch_size;
|
int pos_w = load_image_size.first/patch_size;
|
||||||
int pos_h = image_size_height/patch_size;
|
int pos_h = load_image_size.second/patch_size;
|
||||||
int embed_dim = 4096;
|
int embed_dim = 4096;
|
||||||
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||||
|
|
||||||
|
@ -410,13 +410,10 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
|
|||||||
free(embed);
|
free(embed);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, std::pair<int, int> load_image_size) {
|
||||||
// std::vector<clip_image_f32*> img_res_v;
|
// std::vector<clip_image_f32*> img_res_v;
|
||||||
// format VectN x H x W x RGB (N x 448 x 448 x 3)
|
// format VectN x H x W x RGB (N x 448 x 448 x 3)
|
||||||
clip_image_f32 * img_res_v = clip_image_f32_init();
|
clip_image_f32 * img_res_v = clip_image_f32_init();
|
||||||
std::pair<int, int> load_image_size;
|
|
||||||
load_image_size.first = img->nx;
|
|
||||||
load_image_size.second = img->ny;
|
|
||||||
uhd_normalize_image_u8_to_f32(ctx_clip, img, img_res_v);
|
uhd_normalize_image_u8_to_f32(ctx_clip, img, img_res_v);
|
||||||
|
|
||||||
const int64_t t_img_enc_start_us = ggml_time_us();
|
const int64_t t_img_enc_start_us = ggml_time_us();
|
||||||
@ -545,6 +542,34 @@ static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int tar
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rearranges an RGB image into a single horizontal strip of square tiles.
//
// The input is cut into (height / patch_size) x (width / patch_size) tiles of
// patch_size x patch_size pixels. Tiles are visited row by row, left to right,
// and placed side by side in the output, so the result has
//   nx = patch_size * num_patches, ny = patch_size
// with 3 bytes per pixel, like the input.
// NOTE(review): per the commit title this is meant to imitate the reshape done
// by the reference Python code for MiniCPM-V 2.5 - confirm against that code.
//
// Only whole tiles are copied: if a dimension is not a multiple of patch_size,
// the trailing partial row/column of pixels is dropped. (Stepping over the raw
// pixel extent instead would read past the input buffer and write past the
// output buffer, because the output is sized from the whole-tile count.)
//
// image      - source image; its buf is indexed as 3 * nx * ny bytes.
// patch_size - tile edge length in pixels; must be > 0.
//
// Returns a new image obtained from clip_image_u8_init(); the caller owns it.
// NOTE(review): presumably released with clip_image_u8_free - confirm.
// For a null image or a non-positive patch_size an empty (0 x 0) image is
// returned.
static clip_image_u8 * only_v2_5_reshape_by_patch(clip_image_u8 * image, int patch_size) {
    clip_image_u8 * patch = clip_image_u8_init();

    // Degenerate arguments: avoid dereferencing null, dividing by zero, or
    // resizing the buffer to a negative size.
    if (image == nullptr || patch_size <= 0) {
        patch->nx = 0;
        patch->ny = 0;
        patch->buf.resize(0);
        return patch;
    }

    const int width       = image->nx;
    const int height      = image->ny;
    const int patches_x   = width  / patch_size;
    const int patches_y   = height / patch_size;
    const int num_patches = patches_x * patches_y;

    patch->nx = patch_size * num_patches;
    patch->ny = patch_size;
    patch->buf.resize(static_cast<size_t>(3) * patch->nx * patch->ny);

    // Width of the output strip in pixels; one output row holds row `pi` of every tile.
    const size_t strip_width = static_cast<size_t>(patch_size) * num_patches;

    int patch_index = 0;
    for (int ty = 0; ty < patches_y; ++ty) {
        for (int tx = 0; tx < patches_x; ++tx) {
            // Top-left pixel of the current tile in the source image.
            const int src_y = ty * patch_size;
            const int src_x = tx * patch_size;
            for (int pi = 0; pi < patch_size; ++pi) {
                for (int pj = 0; pj < patch_size; ++pj) {
                    const size_t input_index  = (static_cast<size_t>(src_y + pi) * width + (src_x + pj)) * 3;
                    const size_t output_index = (static_cast<size_t>(pi) * strip_width + static_cast<size_t>(patch_index) * patch_size + pj) * 3;
                    patch->buf[output_index]     = image->buf[input_index];
                    patch->buf[output_index + 1] = image->buf[input_index + 1];
                    patch->buf[output_index + 2] = image->buf[input_index + 2];
                }
            }
            patch_index++;
        }
    }
    return patch;
}
|
||||||
|
|
||||||
// inspired from LLaVA-UHD:
|
// inspired from LLaVA-UHD:
|
||||||
// -> https://arxiv.org/pdf/2403.11703
|
// -> https://arxiv.org/pdf/2403.11703
|
||||||
// -> https://github.com/thunlp/LLaVA-UHD
|
// -> https://github.com/thunlp/LLaVA-UHD
|
||||||
@ -657,7 +682,11 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||||
float* image_embed = NULL;
|
float* image_embed = NULL;
|
||||||
int n_image_pos = 0;
|
int n_image_pos = 0;
|
||||||
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
|
int patch_size=14;
|
||||||
|
std::pair<int, int> load_image_size;
|
||||||
|
load_image_size.first = imgs[i][j]->nx;
|
||||||
|
load_image_size.second = imgs[i][j]->ny;
|
||||||
|
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size);
|
||||||
if (!image_embed_result) {
|
if (!image_embed_result) {
|
||||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -672,7 +701,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size) {
|
||||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
||||||
if (!image_embd) {
|
if (!image_embd) {
|
||||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||||
@ -680,7 +709,7 @@ bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads
|
|||||||
}
|
}
|
||||||
|
|
||||||
int n_img_pos;
|
int n_img_pos;
|
||||||
if (!encode_image_with_clip_uhd(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
|
if (!encode_image_with_clip_uhd(ctx_clip, n_threads, img, image_embd, &n_img_pos, load_image_size)) {
|
||||||
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
|
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
|
||||||
free(image_embd);
|
free(image_embd);
|
||||||
return false;
|
return false;
|
||||||
|
@ -47,7 +47,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
|||||||
/** build an image embed from image file bytes */
|
/** build an image embed from image file bytes */
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
||||||
/** build an image embed from a path to an image filename */
|
/** build an image embed from a path to an image filename */
|
||||||
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size = {448, 448});
|
||||||
LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||||
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
||||||
|
Loading…
Reference in New Issue
Block a user