rename everywhere

This commit is contained in:
Xuan Son Nguyen 2025-01-21 15:53:39 +01:00
parent bd0714b977
commit ad38e87329
7 changed files with 355 additions and 331 deletions

View File

@ -122,7 +122,7 @@ int main(int argc, char ** argv) {
int n_prompt = 0; int n_prompt = 0;
// process image // process image
llama_vision_patches * img_patches = nullptr; llama_vision_tokens * img_tokens = nullptr;
{ {
const char * img_path = params.image[0].c_str(); const char * img_path = params.image[0].c_str();
if (params.image[0].empty()) { if (params.image[0].empty()) {
@ -131,12 +131,12 @@ int main(int argc, char ** argv) {
} }
llama_vision_bitmap * img = load_image_from_file(img_path); llama_vision_bitmap * img = load_image_from_file(img_path);
LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny); LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny);
img_patches = llama_vision_patches_init(ctx, img); img_tokens = llama_vision_tokenize(ctx, img);
if (!img_patches) { if (!img_tokens) {
LOG_ERR("failed to create image patches\n"); LOG_ERR("failed to create image tokens\n");
return 1; return 1;
} }
if (llama_vision_encode(ctx, img_patches)) { if (llama_vision_encode(ctx, img_tokens)) {
LOG_ERR("failed to encode image\n"); LOG_ERR("failed to encode image\n");
return 1; return 1;
} }

View File

@ -229,7 +229,9 @@ extern "C" {
bool sorted; bool sorted;
} llama_token_data_array; } llama_token_data_array;
struct llama_vision_patches; // Structure represents the basic input unit of vision model
// This can be a processed image or slices of images under the hood
struct llama_vision_tokens;
// represent an RGB image // represent an RGB image
// size of data must be equal to 3*nx*ny // size of data must be equal to 3*nx*ny
@ -1286,12 +1288,15 @@ extern "C" {
LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny); LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny);
LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp); LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp);
// Create patches from the RGB bitmap // Create image tokens from the RGB bitmap
LLAMA_API struct llama_vision_patches * llama_vision_patches_init(struct llama_context * ctx, llama_vision_bitmap * bmp); LLAMA_API struct llama_vision_tokens * llama_vision_tokenize(struct llama_context * ctx, llama_vision_bitmap * bmp);
LLAMA_API void llama_vision_patches_free(struct llama_vision_patches * p); LLAMA_API void llama_vision_tokens_free(struct llama_vision_tokens * img_tokens);
// User must reserve N number of tokens in tokenized text prompt for each image
// LLAMA_API int32_t llama_vision_get_n_tokens(const llama_vision_img_tokens * img_tokens);
// Encode patches into embeddings // Encode patches into embeddings
LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_patches * p); LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_tokens * img_tokens);
LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_context * ctx); LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_context * ctx);
// //

View File

@ -110,7 +110,7 @@ struct llama_context {
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
// vision // vision
clip_context vctx; llama_vision_context vctx;
}; };
// TODO: make these methods of llama_context // TODO: make these methods of llama_context

View File

@ -1268,8 +1268,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{ {
std::string name; std::string name;
ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true); ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
vparams.proj_type = clip_projector_type_from_name(name); vparams.proj_type = vision_projector_type_from_name(name);
if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) { if (vparams.proj_type == VISION_PROJECTOR_TYPE_UNKNOWN) {
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str())); throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
} }
} }
@ -3514,7 +3514,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
throw std::runtime_error("unknown vision architecture"); throw std::runtime_error("unknown vision architecture");
} }
if (clip_n_mmproj_embd(clip) != hparams.n_embd) { if (llama_vision_n_mmproj_embd(clip) != hparams.n_embd) {
std::runtime_error("model has vision, but n_mmproj_embd != n_embd"); std::runtime_error("model has vision, but n_mmproj_embd != n_embd");
} }
} }

View File

@ -365,7 +365,7 @@ struct llama_model {
// vision // vision
bool has_vision = false; bool has_vision = false;
clip_vision_model clip; llama_vision_model clip;
private: private:
struct impl; struct impl;

View File

@ -13,25 +13,25 @@
#include <cstdint> #include <cstdint>
#include <iostream> #include <iostream>
// export clip_image_u8 to bmp file for debugging // export llama_image_u8 to bmp file for debugging
// https://codereview.stackexchange.com/questions/195121/writing-a-bitmap-image-from-c // https://codereview.stackexchange.com/questions/195121/writing-a-bitmap-image-from-c
struct clip_image_size; struct img_size;
static int bmp_export(const struct clip_image_u8 &img, const std::string &location); static int bmp_export(const struct llama_image_u8 &img, const std::string &location);
#endif #endif
struct clip_image_size { struct img_size {
int width; int width;
int height; int height;
}; };
// RGB uint8 image // RGB uint8 image
// Memory layout: RGBRGBRGB... // Memory layout: RGBRGBRGB...
struct clip_image_u8 { struct llama_image_u8 {
int nx; int nx;
int ny; int ny;
std::vector<uint8_t> buf; std::vector<uint8_t> buf;
clip_image_u8() {} llama_image_u8() {}
clip_image_u8(const llama_vision_bitmap & bmp) { llama_image_u8(const llama_vision_bitmap & bmp) {
nx = bmp.nx; nx = bmp.nx;
ny = bmp.ny; ny = bmp.ny;
buf.resize(nx*ny*3); buf.resize(nx*ny*3);
@ -39,39 +39,43 @@ struct clip_image_u8 {
} }
}; };
struct clip_image_u8_batch { uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel) {
struct clip_image_u8 * data; auto & proj_type = vmodel.hparams.proj_type;
size_t size; if (proj_type == VISION_PROJECTOR_TYPE_MLP) {
}; return vmodel.mm_2_b->ne[0];
} else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) {
static int clip_n_patches_x(const clip_context & ctx) { return vmodel.mm_model_peg_0_b->ne[0];
auto & hparams = ctx.model->hparams; } else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) {
return hparams.image_size / hparams.patch_size;
}
static int clip_n_patches_y(const clip_context & ctx) {
return clip_n_patches_x(ctx);
}
static int clip_n_patches(const clip_context & ctx) {
return clip_n_patches_x(ctx) * clip_n_patches_y(ctx);
}
uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) {
auto & proj_type = clip_model.hparams.proj_type;
if (proj_type == CLIP_PROJECTOR_TYPE_MLP) {
return clip_model.mm_2_b->ne[0];
} else if (proj_type == CLIP_PROJECTOR_TYPE_LDPV2) {
return clip_model.mm_model_peg_0_b->ne[0];
} else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_5) {
return 4096; return 4096;
} else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_6) { } else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_6) {
return 3584; return 3584;
} else { } else {
GGML_ASSERT(false && "invalid proj type"); GGML_ASSERT(false && "invalid proj type");
} }
} }
//
// internal utils
//
static int get_n_patches_x(const llama_vision_context & ctx) {
auto & hparams = ctx.model->hparams;
return hparams.image_size / hparams.patch_size;
}
static int get_n_patches_y(const llama_vision_context & ctx) {
return get_n_patches_x(ctx);
}
static int get_n_patches(const llama_vision_context & ctx) {
return get_n_patches_x(ctx) * get_n_patches_y(ctx);
}
//
// bitmap utils
//
/** /**
* Selects the best resolution from a list of possible resolutions based on the original size. * Selects the best resolution from a list of possible resolutions based on the original size.
* *
@ -79,11 +83,11 @@ uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) {
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
* @return The best fit resolution in the format (width, height). * @return The best fit resolution in the format (width, height).
*/ */
static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size>& possible_resolutions) { static img_size select_best_resolution(const img_size & original_size, const std::vector<img_size>& possible_resolutions) {
int original_width = original_size.width; int original_width = original_size.width;
int original_height = original_size.height; int original_height = original_size.height;
clip_image_size best_fit; img_size best_fit;
int max_effective_resolution = 0; int max_effective_resolution = 0;
int min_wasted_resolution = std::numeric_limits<int>::max(); int min_wasted_resolution = std::numeric_limits<int>::max();
@ -106,7 +110,7 @@ static clip_image_size select_best_resolution(const clip_image_size & original_s
return best_fit; return best_fit;
} }
static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { static bool bicubic_resize(const llama_image_u8 & img, llama_image_u8 & dst, int target_width, int target_height) {
auto clip = [](int x, int lower, int upper) -> int { auto clip = [](int x, int lower, int upper) -> int {
return std::max(lower, std::min(x, upper)); return std::max(lower, std::min(x, upper));
}; };
@ -173,13 +177,13 @@ static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int t
return true; return true;
} }
static std::vector<clip_image_u8> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { static std::vector<llama_image_u8> divide_to_patches_u8(const llama_image_u8 & image, int patch_size) {
std::vector<clip_image_u8> patches; std::vector<llama_image_u8> patches;
int width = image.nx; int width = image.nx;
int height = image.ny; int height = image.ny;
for (int i = 0; i < height; i += patch_size) { for (int i = 0; i < height; i += patch_size) {
for (int j = 0; j < width; j += patch_size) { for (int j = 0; j < width; j += patch_size) {
clip_image_u8 patch; llama_image_u8 patch;
patch.nx = std::min(patch_size, width - j); patch.nx = std::min(patch_size, width - j);
patch.ny = std::min(patch_size, height - i); patch.ny = std::min(patch_size, height - i);
patch.buf.resize(3 * patch.nx * patch.ny); patch.buf.resize(3 * patch.nx * patch.ny);
@ -197,7 +201,7 @@ static std::vector<clip_image_u8> divide_to_patches_u8(const clip_image_u8 & ima
} }
// llava-1.6 type of resize_and_pad (black) // llava-1.6 type of resize_and_pad (black)
static clip_image_u8 resize_and_pad_image(const clip_image_u8 & image, const clip_image_size & target_resolution) { static llama_image_u8 resize_and_pad_image(const llama_image_u8 & image, const img_size & target_resolution) {
int target_width = target_resolution.width; int target_width = target_resolution.width;
int target_height = target_resolution.height; int target_height = target_resolution.height;
@ -214,11 +218,11 @@ static clip_image_u8 resize_and_pad_image(const clip_image_u8 & image, const cli
new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width); new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
} }
clip_image_u8 resized_image; llama_image_u8 resized_image;
// bilinear_resize(image, resized_image, new_width, new_height); // bilinear_resize(image, resized_image, new_width, new_height);
bicubic_resize(image, resized_image, new_width, new_height); bicubic_resize(image, resized_image, new_width, new_height);
clip_image_u8 padded_image; llama_image_u8 padded_image;
padded_image.nx = target_width; padded_image.nx = target_width;
padded_image.ny = target_height; padded_image.ny = target_height;
padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
@ -238,7 +242,7 @@ static clip_image_u8 resize_and_pad_image(const clip_image_u8 & image, const cli
return padded_image; return padded_image;
} }
static void normalize_image_u8_to_f32(const clip_image_u8 & src, std::vector<float> & dst, const std::array<float, 3> & mean, const std::array<float, 3> & std) { static void normalize_image_u8_to_f32(const llama_image_u8 & src, std::vector<float> & dst, const std::array<float, 3> & mean, const std::array<float, 3> & std) {
dst.resize(src.buf.size()); dst.resize(src.buf.size());
for (size_t i = 0; i < src.buf.size(); ++i) { for (size_t i = 0; i < src.buf.size(); ++i) {
@ -247,176 +251,23 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, std::vector<flo
} }
} }
#define LLAMA_LOG_DEBUG LLAMA_LOG_INFO
// minicpmv preprocessor //
struct minicpmv_preprocessor { // processor
int ensure_divide(int length, int patch_size) { //
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
}
std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { struct llama_vision_processor {
int width = original_size.first; const llama_vision_context & ctx;
int height = original_size.second; llama_vision_processor(const llama_vision_context & ctx) : ctx(ctx) {}
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { virtual llama_vision_tokens tokenize(const llama_image_u8 & img) = 0;
float r = static_cast<float>(width) / height; virtual ~llama_vision_processor() = default;
height = static_cast<int>(scale_resolution / std::sqrt(r));
width = static_cast<int>(height * r);
}
int best_width = ensure_divide(width, patch_size);
int best_height = ensure_divide(height, patch_size);
return std::make_pair(best_width, best_height);
}
std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width, height;
std::tie(width, height) = original_size;
int grid_x, grid_y;
std::tie(grid_x, grid_y) = grid;
int refine_width = ensure_divide(width, grid_x);
int refine_height = ensure_divide(height, grid_y);
int grid_width = refine_width / grid_x;
int grid_height = refine_height / grid_y;
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
int best_grid_width, best_grid_height;
std::tie(best_grid_width, best_grid_height) = best_grid_size;
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
return refine_size;
}
std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
if (i == 1 || i > max_slice_nums) {
continue;
}
candidate_split_grids_nums.push_back(i);
}
std::vector<std::pair<int, int>> candidate_grids;
for (int split_grids_nums : candidate_split_grids_nums) {
int m = 1;
while (m <= split_grids_nums) {
if (split_grids_nums % m == 0) {
candidate_grids.emplace_back(m, split_grids_nums / m);
}
++m;
}
}
std::pair<int, int> best_grid{1, 1};
float min_error = std::numeric_limits<float>::infinity();
for (const auto& grid : candidate_grids) {
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
if (error < min_error) {
best_grid = grid;
min_error = error;
}
}
return best_grid;
}
std::vector<std::vector<clip_image_u8>> uhd_slice_image(
const clip_image_u8 & img,
const int max_slice_nums = 9,
const int scale_resolution = 448,
const int patch_size = 14) {
const std::pair<int, int> original_size={img.nx,img.ny};
const int original_width = img.nx;
const int original_height = img.ny;
const float log_ratio = log(1.0*original_width/original_height);
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8>> images;
LLAMA_LOG_DEBUG("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8>());
if (multiple <= 1) {
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
clip_image_u8 source_image;
bicubic_resize(img, source_image, best_size.first, best_size.second);
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
images[images.size()-1].push_back(source_image);
}
else if (multiple > 1) {
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
clip_image_u8 source_image;
bicubic_resize(img, source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 refine_image;
bicubic_resize(img, refine_image, refine_size.first, refine_size.second);
LLAMA_LOG_DEBUG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image.nx, refine_image.ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image.nx;
int height = refine_image.ny;
int grid_x = int(width / best_grid.first);
int grid_y = int(height / best_grid.second);
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
images.push_back(std::vector<clip_image_u8>());
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
clip_image_u8 patch;
patch.nx = grid_x;
patch.ny = grid_y;
patch.buf.resize(3 * patch.nx * patch.ny);
for (int y = patches_i; y < patches_i + grid_y; ++y) {
for (int x = patches_j; x < patches_j + grid_x; ++x) {
const int i = 3 * (y * refine_image.nx + x);
const int j = 3 * ((y-patches_i) * patch.nx + (x-patches_j));
patch.buf[j] = refine_image.buf[i];
patch.buf[j+1] = refine_image.buf[i+1];
patch.buf[j+2] = refine_image.buf[i+2];
}
}
images[images.size()-1].push_back(patch);
}
}
}
return images;
}
}; };
static llama_vision_patches clip_image_preprocess_minicpmv(const clip_context & ctx, const clip_image_u8 & img) { // inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava/processing_llava.py
auto & params = ctx.model->hparams; struct llama_vision_processor_llava : llama_vision_processor {
GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV); llama_vision_processor_llava(const llama_vision_context & ctx) : llama_vision_processor(ctx) {}
static const int max_slice_nums = 9; virtual llama_vision_tokens tokenize(const llama_image_u8 & img) override {
minicpmv_preprocessor preprocessor;
std::vector<std::vector<clip_image_u8>> imgs = preprocessor.uhd_slice_image(img, max_slice_nums);
llama_vision_patches output_patches;
output_patches.n_px = clip_n_patches_x(ctx);
output_patches.n_py = clip_n_patches_y(ctx);
output_patches.px = params.patch_size;
output_patches.py = params.patch_size;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
std::vector<float> res;
normalize_image_u8_to_f32(imgs[i][j], res, params.image_mean, params.image_std);
output_patches.buf.push_back(res);
}
}
}
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
static llama_vision_patches clip_image_preprocess(const clip_context & ctx, const clip_image_u8 & img) {
bool pad_to_square = true; bool pad_to_square = true;
auto & params = ctx.model->hparams; auto & params = ctx.model->hparams;
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
@ -424,16 +275,16 @@ static llama_vision_patches clip_image_preprocess(const clip_context & ctx, cons
pad_to_square = false; pad_to_square = false;
} }
llama_vision_patches output_patches; llama_vision_tokens output_slices;
output_patches.n_px = clip_n_patches_x(ctx); output_slices.n_px = get_n_patches_x(ctx);
output_patches.n_py = clip_n_patches_y(ctx); output_slices.n_py = get_n_patches_y(ctx);
output_patches.px = params.patch_size; output_slices.px = params.patch_size;
output_patches.py = params.patch_size; output_slices.py = params.patch_size;
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8 temp; llama_image_u8 temp;
if (pad_to_square && img.nx != img.ny) { if (pad_to_square && img.nx != img.ny) {
// if the image is not square, pad it to a square // if the image is not square, pad it to a square
int longer_side = std::max(img.nx, img.ny); int longer_side = std::max(img.nx, img.ny);
@ -459,32 +310,31 @@ static llama_vision_patches clip_image_preprocess(const clip_context & ctx, cons
} }
} else if (params.image_grid_pinpoints[0] != 0) { } else if (params.image_grid_pinpoints[0] != 0) {
// "spatial_unpad" with "anyres" processing for llava-1.6 // "spatial_unpad" with "anyres" processing for llava-1.6
std::vector<clip_image_size> possible_resolutions; std::vector<img_size> possible_resolutions;
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) { for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) {
clip_image_size s; img_size s;
s.width = params.image_grid_pinpoints[i]; s.width = params.image_grid_pinpoints[i];
s.height = params.image_grid_pinpoints[i+1]; s.height = params.image_grid_pinpoints[i+1];
possible_resolutions.push_back(s); possible_resolutions.push_back(s);
} }
clip_image_size best_resolution = select_best_resolution({img.nx, img.ny}, possible_resolutions); img_size best_resolution = select_best_resolution({img.nx, img.ny}, possible_resolutions);
// clip_image_save_to_bmp(*img, "input.bmp"); // debug_image_save_to_bmp(*img, "input.bmp");
temp = resize_and_pad_image(img, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 temp = resize_and_pad_image(img, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
// clip_image_save_to_bmp(*temp, "resized.bmp"); // debug_image_save_to_bmp(*temp, "resized.bmp");
std::vector<clip_image_u8> patches = divide_to_patches_u8(temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) std::vector<llama_image_u8> patches = divide_to_patches_u8(temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
clip_image_u8 image_original_resize; llama_image_u8 image_original_resize;
// bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
bicubic_resize(img, image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square bicubic_resize(img, image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
patches.insert(patches.begin(), image_original_resize); patches.insert(patches.begin(), image_original_resize);
// clip_image_f32_batch_init(patches.size()); output_slices.buf.resize(patches.size());
output_patches.buf.resize(patches.size());
int num = 0; int num = 0;
for (auto & patch : patches) { for (auto & patch : patches) {
normalize_image_u8_to_f32(patch, output_patches.buf[num], params.image_mean, params.image_std); normalize_image_u8_to_f32(patch, output_slices.buf[num], params.image_mean, params.image_std);
num++; num++;
} }
return output_patches; return output_slices;
} else { } else {
temp.nx = img.nx; temp.nx = img.nx;
temp.ny = img.ny; temp.ny = img.ny;
@ -549,13 +399,180 @@ static llama_vision_patches clip_image_preprocess(const clip_context & ctx, cons
} }
} }
output_patches.buf.resize(1); output_slices.buf.resize(1);
output_patches.buf[0] = std::move(res); output_slices.buf[0] = std::move(res);
return output_patches; return output_slices;
} };
};
static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, clip_image_size & image_size) { struct llama_vision_processor_uhd : llama_vision_processor {
llama_vision_processor_uhd(const llama_vision_context & ctx) : llama_vision_processor(ctx) {}
int ensure_divide(int length, int patch_size) {
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
}
std::pair<int, int> find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width = original_size.first;
int height = original_size.second;
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
float r = static_cast<float>(width) / height;
height = static_cast<int>(scale_resolution / std::sqrt(r));
width = static_cast<int>(height * r);
}
int best_width = ensure_divide(width, patch_size);
int best_height = ensure_divide(height, patch_size);
return std::make_pair(best_width, best_height);
}
std::pair<int, int> get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width, height;
std::tie(width, height) = original_size;
int grid_x, grid_y;
std::tie(grid_x, grid_y) = grid;
int refine_width = ensure_divide(width, grid_x);
int refine_height = ensure_divide(height, grid_y);
int grid_width = refine_width / grid_x;
int grid_height = refine_height / grid_y;
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
auto best_grid_size = find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
int best_grid_width, best_grid_height;
std::tie(best_grid_width, best_grid_height) = best_grid_size;
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
return refine_size;
}
std::pair<int, int> find_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
if (i == 1 || i > max_slice_nums) {
continue;
}
candidate_split_grids_nums.push_back(i);
}
std::vector<std::pair<int, int>> candidate_grids;
for (int split_grids_nums : candidate_split_grids_nums) {
int m = 1;
while (m <= split_grids_nums) {
if (split_grids_nums % m == 0) {
candidate_grids.emplace_back(m, split_grids_nums / m);
}
++m;
}
}
std::pair<int, int> best_grid{1, 1};
float min_error = std::numeric_limits<float>::infinity();
for (const auto& grid : candidate_grids) {
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
if (error < min_error) {
best_grid = grid;
min_error = error;
}
}
return best_grid;
}
std::vector<std::vector<llama_image_u8>> slice_image(
const llama_image_u8 & img,
const int max_slice_nums = 9,
const int scale_resolution = 448,
const int patch_size = 14) {
const std::pair<int, int> original_size={img.nx,img.ny};
const int original_width = img.nx;
const int original_height = img.ny;
const float log_ratio = log(1.0*original_width/original_height);
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<llama_image_u8>> images;
LLAMA_LOG_DEBUG("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<llama_image_u8>());
if (multiple <= 1) {
auto best_size = find_best_resize(original_size, scale_resolution, patch_size, true);
llama_image_u8 source_image;
bicubic_resize(img, source_image, best_size.first, best_size.second);
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
images[images.size()-1].push_back(source_image);
}
else if (multiple > 1) {
auto best_size = find_best_resize(original_size, scale_resolution, patch_size);
llama_image_u8 source_image;
bicubic_resize(img, source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = find_best_grid(max_slice_nums, multiple, log_ratio);
LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.first, best_grid.second);
auto refine_size = get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
llama_image_u8 refine_image;
bicubic_resize(img, refine_image, refine_size.first, refine_size.second);
LLAMA_LOG_DEBUG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image.nx, refine_image.ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image.nx;
int height = refine_image.ny;
int grid_x = int(width / best_grid.first);
int grid_y = int(height / best_grid.second);
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
images.push_back(std::vector<llama_image_u8>());
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
llama_image_u8 patch;
patch.nx = grid_x;
patch.ny = grid_y;
patch.buf.resize(3 * patch.nx * patch.ny);
for (int y = patches_i; y < patches_i + grid_y; ++y) {
for (int x = patches_j; x < patches_j + grid_x; ++x) {
const int i = 3 * (y * refine_image.nx + x);
const int j = 3 * ((y-patches_i) * patch.nx + (x-patches_j));
patch.buf[j] = refine_image.buf[i];
patch.buf[j+1] = refine_image.buf[i+1];
patch.buf[j+2] = refine_image.buf[i+2];
}
}
images[images.size()-1].push_back(patch);
}
}
}
return images;
}
virtual llama_vision_tokens tokenize(const llama_image_u8 & img) override {
auto & params = ctx.model->hparams;
GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV);
std::vector<std::vector<llama_image_u8>> imgs = slice_image(img);
llama_vision_tokens output;
output.n_px = get_n_patches_x(ctx);
output.n_py = get_n_patches_y(ctx);
output.px = params.patch_size;
output.py = params.patch_size;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
std::vector<float> res;
normalize_image_u8_to_f32(imgs[i][j], res, params.image_mean, params.image_std);
output.buf.push_back(res);
}
}
return output;
}
};
static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int batch_size, img_size & image_size) {
auto & model = *ctx.model; auto & model = *ctx.model;
auto & hparams = ctx.model->hparams; auto & hparams = ctx.model->hparams;
@ -726,7 +743,7 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
// ne is whcn, ne = [1024, 576, 1, 1] // ne is whcn, ne = [1024, 576, 1, 1]
embeddings = ggml_get_rows(ctx0, embeddings, patches); embeddings = ggml_get_rows(ctx0, embeddings, patches);
if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) { if (hparams.proj_type == VISION_PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
@ -734,7 +751,7 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else if (hparams.proj_type == CLIP_PROJECTOR_TYPE_LDPV2) { } else if (hparams.proj_type == VISION_PROJECTOR_TYPE_LDPV2) {
int n_patch = 24; int n_patch = 24;
struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
@ -770,8 +787,8 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
return gf; return gf;
} }
static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches & patches) { static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) {
int batch_size = patches.buf.size(); int batch_size = inp.buf.size();
auto & model = *ctx.model; auto & model = *ctx.model;
auto & hparams = ctx.model->hparams; auto & hparams = ctx.model->hparams;
@ -779,7 +796,7 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches
GGML_ASSERT(batch_size == 1); // TODO: support multiple images GGML_ASSERT(batch_size == 1); // TODO: support multiple images
} }
clip_image_size image_size{(int)hparams.image_size, (int)hparams.image_size}; img_size image_size{(int)hparams.image_size, (int)hparams.image_size};
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size)); const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size));
const int num_positions = num_patches + (model.class_embedding ? 1 : 0); const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
@ -788,7 +805,7 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches
LLAMA_LOG_DEBUG("%s: num_positions = %d\n", __func__, num_positions); LLAMA_LOG_DEBUG("%s: num_positions = %d\n", __func__, num_positions);
// build the inference graph // build the inference graph
ggml_cgraph * gf = clip_image_build_graph(ctx, batch_size, image_size); ggml_cgraph * gf = llama_vision_build_graph(ctx, batch_size, image_size);
// alloc memory for graph // alloc memory for graph
bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf); bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf);
@ -803,15 +820,15 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches
float * data = (float *)malloc(ggml_nbytes(inp_raw)); float * data = (float *)malloc(ggml_nbytes(inp_raw));
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
const int nx = patches.px * patches.n_px; const int nx = inp.px * inp.n_px;
const int ny = patches.py * patches.n_py; const int ny = inp.py * inp.n_py;
const int n = nx * ny; const int n = nx * ny;
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) { for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) { for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = patches.buf[b][3 * (y * nx + x) + k]; data[(b * 3 * n) + k * n + y * nx + x] = inp.buf[b][3 * (y * nx + x) + k];
} }
} }
} }
@ -891,34 +908,38 @@ void llama_vision_bitmap_free(llama_vision_bitmap * bmp) {
delete bmp; delete bmp;
} }
struct llama_vision_patches * llama_vision_patches_init( struct llama_vision_tokens * llama_vision_tokenize(
struct llama_context * ctx, struct llama_context * ctx,
llama_vision_bitmap * bmp) { llama_vision_bitmap * bmp) {
clip_context & vctx = ctx->vctx; llama_vision_context & vctx = ctx->vctx;
if (vctx.model->hparams.arch == LLM_ARCH_VISION_MINICPMV) { switch (vctx.model->hparams.arch) {
return new llama_vision_patches(clip_image_preprocess_minicpmv(vctx, *bmp)); case LLM_ARCH_VISION_LLAVA:
case LLM_ARCH_VISION_MOBILEVLM:
return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
case LLM_ARCH_VISION_MINICPMV:
return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp));
default:
GGML_ASSERT(false && "unsupported arch");
} }
return new llama_vision_patches(clip_image_preprocess(vctx, *bmp));
} }
void llama_vision_patches_free(llama_vision_patches * p) { void llama_vision_tokens_free(llama_vision_tokens * p) {
delete p; delete p;
} }
int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_patches * p) { int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p) {
if (p->buf.empty()) { if (p->buf.empty()) {
LLAMA_LOG_ERROR("%s: nothing to encode\n", __func__); LLAMA_LOG_ERROR("%s: nothing to encode\n", __func__);
return -1; return -1;
} }
clip_context & vctx = ctx->vctx; llama_vision_context & vctx = ctx->vctx;
auto & hparams = vctx.model->hparams; auto & hparams = vctx.model->hparams;
switch (hparams.mm_patch_merge_type) { switch (hparams.mm_patch_merge_type) {
case MM_PATCH_MERGE_FLAT: case MM_PATCH_MERGE_FLAT:
{ {
// flat / default llava-1.5 type embedding // flat / default llava-1.5 type embedding
// n_output = clip_n_patches(ctx); int32_t encoded = llama_vision_encode_impl(vctx, *p);
int32_t encoded = clip_image_encode(vctx, *p);
if (encoded != 0) { if (encoded != 0) {
LLAMA_LOG_ERROR("Unable to encode image\n"); LLAMA_LOG_ERROR("Unable to encode image\n");
return encoded; return encoded;
@ -944,7 +965,7 @@ struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx) {
// for debugging // for debugging
#ifndef NDEBUG #ifndef NDEBUG
static int bmp_export(const struct clip_image_u8 &img, const std::string &location) { static int bmp_export(const struct llama_image_u8 &img, const std::string &location) {
const uint32_t width = img.nx; const uint32_t width = img.nx;
const uint32_t height = img.ny; const uint32_t height = img.ny;
// swap red and blue channel // swap red and blue channel

View File

@ -7,12 +7,12 @@
#include <vector> #include <vector>
#include <array> #include <array>
enum clip_projector_type { enum vision_projector_type {
CLIP_PROJECTOR_TYPE_UNKNOWN, VISION_PROJECTOR_TYPE_UNKNOWN,
CLIP_PROJECTOR_TYPE_MLP, VISION_PROJECTOR_TYPE_MLP,
CLIP_PROJECTOR_TYPE_LDPV2, VISION_PROJECTOR_TYPE_LDPV2,
CLIP_PROJECTOR_TYPE_MINICPMV_2_5, VISION_PROJECTOR_TYPE_MINICPMV_2_5,
CLIP_PROJECTOR_TYPE_MINICPMV_2_6, VISION_PROJECTOR_TYPE_MINICPMV_2_6,
}; };
enum mm_patch_merge { enum mm_patch_merge {
@ -21,7 +21,8 @@ enum mm_patch_merge {
MM_PATCH_MERGE_SPATIAL_UNPAD, MM_PATCH_MERGE_SPATIAL_UNPAD,
}; };
struct clip_hparams { struct llama_vision_model {
struct vision_hparams {
llm_arch arch = LLM_ARCH_UNKNOWN; llm_arch arch = LLM_ARCH_UNKNOWN;
uint32_t image_size; uint32_t image_size;
@ -37,7 +38,7 @@ struct clip_hparams {
float eps; float eps;
clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN; vision_projector_type proj_type = VISION_PROJECTOR_TYPE_UNKNOWN;
mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_UNKNOWN; mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_UNKNOWN;
std::array<float, 3> image_mean; std::array<float, 3> image_mean;
@ -45,9 +46,20 @@ struct clip_hparams {
std::array<int32_t, 32> image_grid_pinpoints; // TODO: should this be array of (x, y) pairs? std::array<int32_t, 32> image_grid_pinpoints; // TODO: should this be array of (x, y) pairs?
int32_t image_crop_resolution; int32_t image_crop_resolution;
}; };
struct vision_hparams hparams;
ggml_backend_buffer_type_t buft;
struct clip_layer { // embeddings
struct ggml_tensor * class_embedding = nullptr;
struct ggml_tensor * patch_embeddings = nullptr;
struct ggml_tensor * patch_bias = nullptr;
struct ggml_tensor * position_embeddings = nullptr;
struct ggml_tensor * pre_norm_w = nullptr;
struct ggml_tensor * pre_norm_b = nullptr;
struct vision_layer {
// attention // attention
struct ggml_tensor * k_w = nullptr; struct ggml_tensor * k_w = nullptr;
struct ggml_tensor * k_b = nullptr; struct ggml_tensor * k_b = nullptr;
@ -73,22 +85,8 @@ struct clip_layer {
// layernorm 2 // layernorm 2
struct ggml_tensor * norm_out_w = nullptr; struct ggml_tensor * norm_out_w = nullptr;
struct ggml_tensor * norm_out_b = nullptr; struct ggml_tensor * norm_out_b = nullptr;
}; };
std::vector<vision_layer> layers;
struct clip_vision_model {
struct clip_hparams hparams;
ggml_backend_buffer_type_t buft;
// embeddings
struct ggml_tensor * class_embedding = nullptr;
struct ggml_tensor * patch_embeddings = nullptr;
struct ggml_tensor * patch_bias = nullptr;
struct ggml_tensor * position_embeddings = nullptr;
struct ggml_tensor * pre_norm_w = nullptr;
struct ggml_tensor * pre_norm_b = nullptr;
std::vector<clip_layer> layers;
struct ggml_tensor * post_norm_w = nullptr; struct ggml_tensor * post_norm_w = nullptr;
struct ggml_tensor * post_norm_b = nullptr; struct ggml_tensor * post_norm_b = nullptr;
@ -132,13 +130,13 @@ struct clip_vision_model {
struct ggml_tensor * image_newline = nullptr; struct ggml_tensor * image_newline = nullptr;
}; };
struct clip_context { struct llama_vision_context {
// memory buffers used to evaluate the model // memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta; std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_t sched = nullptr; ggml_backend_sched_t sched = nullptr;
struct ggml_context * ctx_ggml = nullptr; struct ggml_context * ctx_ggml = nullptr;
const clip_vision_model * model; const llama_vision_model * model;
// temporary output data, to be picked up by llama_decode() // temporary output data, to be picked up by llama_decode()
struct ggml_tensor * output; struct ggml_tensor * output;
@ -147,7 +145,7 @@ struct clip_context {
// for now, this only contains: // for now, this only contains:
// - the instruction for ggml_conv_2d to break the image into patches // - the instruction for ggml_conv_2d to break the image into patches
// - the pre-processed image data in f32 // - the pre-processed image data in f32
struct llama_vision_patches { struct llama_vision_tokens {
uint32_t px; // size of patch uint32_t px; // size of patch
uint32_t py; // size of patch uint32_t py; // size of patch
size_t n_px; // number of patches in x direction size_t n_px; // number of patches in x direction
@ -166,20 +164,20 @@ inline mm_patch_merge mm_patch_merge_from_name(std::string & name) {
return MM_PATCH_MERGE_UNKNOWN; return MM_PATCH_MERGE_UNKNOWN;
} }
inline clip_projector_type clip_projector_type_from_name(std::string & name) { inline vision_projector_type vision_projector_type_from_name(std::string & name) {
if (name == "mlp") { if (name == "mlp") {
return CLIP_PROJECTOR_TYPE_MLP; return VISION_PROJECTOR_TYPE_MLP;
} else if (name == "ldpv2") { } else if (name == "ldpv2") {
return CLIP_PROJECTOR_TYPE_LDPV2; return VISION_PROJECTOR_TYPE_LDPV2;
} else if (name == "minicpmv-2.5") { } else if (name == "minicpmv-2.5") {
return CLIP_PROJECTOR_TYPE_MINICPMV_2_5; return VISION_PROJECTOR_TYPE_MINICPMV_2_5;
} else if (name == "minicpmv-2.6") { } else if (name == "minicpmv-2.6") {
return CLIP_PROJECTOR_TYPE_MINICPMV_2_6; return VISION_PROJECTOR_TYPE_MINICPMV_2_6;
} }
return CLIP_PROJECTOR_TYPE_UNKNOWN; return VISION_PROJECTOR_TYPE_UNKNOWN;
} }
// only for sanity check: must be equal to n_embd of language model // only for sanity check: must be equal to n_embd of language model
uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model); uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel);
struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx); struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx);