mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-27 04:23:06 +01:00
fix uhd code for review comment
This commit is contained in:
parent
6fd0937e9f
commit
107e1edb20
@ -1583,19 +1583,6 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst) {
|
|
||||||
dst->nx = src->nx;
|
|
||||||
dst->ny = src->ny;
|
|
||||||
dst->buf.resize(src->buf.size());
|
|
||||||
const auto & m3 = ctx->image_mean;
|
|
||||||
const auto & s3 = ctx->image_std;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < src->buf.size(); ++i) {
|
|
||||||
int c = i % 3; // rgb
|
|
||||||
dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - m3[c]) / s3[c];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Limits x to the range [lower, upper].
// x is first capped at `upper`; the capped value is returned only if it is
// strictly greater than `lower`, otherwise `lower` is returned.
inline float clip(float x, float lower, float upper) {
    const float capped = (upper < x) ? upper : x;
    return (lower < capped) ? capped : lower;
}
|
||||||
@ -1764,6 +1751,17 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
|||||||
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
||||||
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
||||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
||||||
|
|
||||||
|
if(clip_is_minicpmv(ctx)){
|
||||||
|
clip_image_f32 * res = clip_image_f32_init();
|
||||||
|
normalize_image_u8_to_f32(img, res, ctx->image_mean, ctx->image_std);
|
||||||
|
res_imgs->size = 1;
|
||||||
|
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||||
|
res_imgs->data[0] = *res;
|
||||||
|
clip_image_f32_free(res);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool pad_to_square = true;
|
bool pad_to_square = true;
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
@ -2390,3 +2388,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||||
|
return ctx->has_minicpmv_projector;
|
||||||
|
}
|
@ -77,8 +77,6 @@ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t byt
|
|||||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||||
|
|
||||||
CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst);
|
|
||||||
|
|
||||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||||
@ -86,6 +84,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
|
|||||||
|
|
||||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||||
|
|
||||||
|
CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -413,32 +413,6 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
|
|||||||
free(embed);
|
free(embed);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Normalizes `img` into a temporary float image and encodes it with CLIP.
//
// ctx_clip   : clip context used for normalization and encoding
// n_threads  : thread count forwarded to clip_image_encode
// img        : 8-bit input image
// image_embd : caller-provided output buffer that receives the embedding
// n_img_pos  : receives clip_n_patches(ctx_clip)
//
// Returns true when clip_image_encode succeeds, false otherwise.
// The temporary float image is released on every path (it used to leak).
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
    // format VectN x H x W x RGB (N x 448 x 448 x 3)
    clip_image_f32 * img_res_v = clip_image_f32_init();
    uhd_normalize_image_u8_to_f32(ctx_clip, img, img_res_v);

    const int64_t t_img_enc_start_us = ggml_time_us();

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
    LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);

    *n_img_pos = clip_n_patches(ctx_clip);
    const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd); // image_embd shape is 96 x 4096

    // The normalized copy is only needed as input to the encode call above;
    // free it here so neither the failure nor the success path leaks it.
    clip_image_f32_free(img_res_v);

    if (!encoded) {
        LOG_TEE("Unable to encode image\n");
        return false;
    }
    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
}
|
|
||||||
|
|
||||||
// Rounds `length` to the nearest multiple of `patch_size` (halves round away
// from zero, per std::round) and never returns less than `patch_size` itself.
static int ensure_divide(int length, int patch_size) {
    const int snapped = static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size);
    if (snapped < patch_size) {
        return patch_size;
    }
    return snapped;
}
|
||||||
@ -691,7 +665,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
load_image_size->height = imgs[i][j]->ny;
|
load_image_size->height = imgs[i][j]->ny;
|
||||||
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||||
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos);
|
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos);
|
||||||
if (!image_embed_result) {
|
if (!image_embed_result) {
|
||||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -706,25 +680,6 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Allocates an embedding buffer (6x clip_embd_nbytes) with malloc, encodes
// `img` into it via encode_image_with_clip_uhd, and hands the buffer and the
// image position count back through the out-parameters.
//
// Returns false if the allocation or the encoding fails; in the encoding
// failure case the buffer is freed and the out-parameters are left untouched.
// On success the buffer written to *image_embd_out is not freed here.
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    const size_t embd_bytes = clip_embd_nbytes(ctx_clip)*6; // TODO: base on gridsize/llava model
    float * embd_buf = (float *)malloc(embd_bytes);
    if (embd_buf == nullptr) {
        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_positions;
    const bool encoded_ok = encode_image_with_clip_uhd(ctx_clip, n_threads, img, embd_buf, &n_positions);
    if (encoded_ok) {
        *image_embd_out = embd_buf;
        *n_img_pos_out = n_positions;
        return true;
    }

    LOG_TEE("%s: cannot encode image, aborting\n", __func__);
    free(embd_buf);
    return false;
}
|
|
||||||
|
|
||||||
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
|
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
|
||||||
unsigned char* image_bytes;
|
unsigned char* image_bytes;
|
||||||
long image_bytes_length;
|
long image_bytes_length;
|
||||||
|
@ -45,7 +45,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
|||||||
/** build an image embed from image file bytes */
|
/** build an image embed from image file bytes */
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
||||||
/** build an image embed from a path to an image filename */
|
/** build an image embed from a path to an image filename */
|
||||||
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||||
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user