diff --git a/.gitignore b/.gitignore index 50ae0973a..1ed59c80e 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,8 @@ models-mnt /libllama.so /llama-bench /llava-cli +/minicpmv-cli +/openbmb /lookahead /lookup /lookup-create diff --git a/Makefile b/Makefile index 5caf31cdf..918417c77 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli minicpmv-cli baby-llama beam-search \ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests @@ -862,6 +862,13 @@ llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/cli $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) +minicpmv-cli: examples/minicpmv/minicpmv-cli.cpp examples/minicpmv/clip.h examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.h examples/minicpmv/minicpmv.cpp examples/minicpmv/minicpmv_wrapper.h examples/minicpmv/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/minicpmv/clip.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/minicpmv/minicpmv.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp) + $(CXX) $(CXXFLAGS) -c examples/minicpmv/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/minicpmv_wrapper.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.cpp examples/minicpmv/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/minicpmv/clip.cpp) $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp) $(call GET_OBJ_FILE, examples/minicpmv/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS) + baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1b060e4e6..81f80d077 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -675,44 +675,6 @@ class GPTNeoXModel(Model): self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - tensors: list[tuple[str, Tensor]] = [] - - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
-            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
-            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
-            data_torch = torch.cat(
-                (
-                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
-                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
-                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
-                ),
-                dim=0,
-            )
-            logger.info("re-format attention.linear_qkv.weight")
-        elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
-            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
-            data_torch = torch.cat(
-                (
-                    qkv_bias[:, 0, :].reshape((n_embed,)),
-                    qkv_bias[:, 1, :].reshape((n_embed,)),
-                    qkv_bias[:, 2, :].reshape((n_embed,)),
-                ),
-                dim=0,
-            )
-            logger.info("re-format attention.linear_qkv.bias")
-
-        tensors.append((self.map_tensor_name(name), data_torch))
-
-        return tensors
-
 
 @Model.register("BloomForCausalLM")
 class BloomModel(Model):
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b40ee4ccb..d8c95f36d 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -26,6 +26,7 @@ else()
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
+    add_subdirectory(minicpmv)
     if (LLAMA_SYCL)
         add_subdirectory(sycl)
     endif()
diff --git a/examples/minicpmv/CMakeLists.txt b/examples/minicpmv/CMakeLists.txt
new file mode 100644
index 000000000..7f6f3f693
--- /dev/null
+++ b/examples/minicpmv/CMakeLists.txt
@@ -0,0 +1,42 @@
+add_library(minicpmv OBJECT
+            minicpmv.cpp
+            minicpmv.h
+            clip.cpp
+            clip.h
+            )
+
+target_link_libraries(minicpmv PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(minicpmv PUBLIC .)
+target_include_directories(minicpmv PUBLIC ../..)
+target_include_directories(minicpmv PUBLIC ../../common)
+
+target_compile_features(minicpmv PRIVATE cxx_std_11)
+
+add_library(minicpmv_static STATIC $<TARGET_OBJECTS:minicpmv>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(minicpmv PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(minicpmv PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(minicpmv_shared SHARED $<TARGET_OBJECTS:minicpmv>)
+    target_link_libraries(minicpmv_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS minicpmv_shared LIBRARY)
+endif()
+
+if (NOT MSVC)
+    target_compile_options(minicpmv PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+if(TARGET BUILD_INFO)
+    add_dependencies(minicpmv BUILD_INFO)
+endif()
+
+set(TARGET minicpmv-cli)
+add_executable(minicpmv-cli minicpmv-cli.cpp)
+install(TARGETS minicpmv-cli RUNTIME)
+target_link_libraries(minicpmv-cli PRIVATE common minicpmv_wrapper minicpmv ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(minicpmv PRIVATE cxx_std_11)
+
+add_library(minicpmv_wrapper OBJECT
+    minicpmv_wrapper.cpp
+)
+target_link_libraries(minicpmv_wrapper PRIVATE minicpmv ${CMAKE_THREAD_LIBS_INIT})
\ No newline at end of file
diff --git a/examples/minicpmv/README.md b/examples/minicpmv/README.md
new file mode 100644
index 000000000..b661b2646
--- /dev/null
+++ b/examples/minicpmv/README.md
@@ -0,0 +1,104 @@
+## MiniCPM-Llama3-V 2.5
+
+### Usage
+
+Download the [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from Hugging Face into a "MiniCPM-Llama3-V-2_5" folder.
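For example, one way to fetch the checkpoint is to clone the model repository with `git-lfs` (an illustrative sketch, not part of the upstream instructions; any download method that produces this folder next to your llama.cpp checkout works):

```bash
# illustrative only: clone the PyTorch checkpoint into a sibling "MiniCPM-Llama3-V-2_5" folder
git lfs install
git clone https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5
```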
+
+Clone llama.cpp and check out the `minicpm-v2.5` branch:
+```bash
+git clone -b minicpm-v2.5 https://github.com/OpenBMB/llama.cpp.git
+cd llama.cpp
+```
+
+Convert the PyTorch model to gguf files (you can also download the pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files):
+
+```bash
+python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
+python ./convert.py ../MiniCPM-Llama3-V-2_5/model --outtype f16 --vocab-type bpe
+
+# quantize int4 version
+./quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+make minicpmv-cli
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# or run in interactive mode
+./minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that building on the Android device itself gives better runtime performance, so we recommend building on-device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
+
+Install the required tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model into the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your phone. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+, run the command twice).
+
+Finally, copy the built binaries and the model files to your device storage, for example with `adb push` as shown below.
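A typical transfer from the host machine looks something like this (an illustrative sketch, not part of the upstream README; adjust the source paths to wherever your build placed the binaries and gguf files):

```bash
# illustrative only: push the cross-compiled binaries and the gguf files to the phone's sdcard
adb push build-android/bin /sdcard/llama.cpp/bin
adb push ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf /sdcard/llama.cpp/
adb push ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf /sdcard/llama.cpp/
```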
+Because file permissions on the Android sdcard cannot be changed, copy the executable files to `/data/data/com.termux/files/home/bin`, and then run the following commands in Termux to make them executable:
+
+(This assumes you have pushed the built executable files to `/sdcard/llama.cpp/bin` using `adb push`.)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`:
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+```
+
+### Result
+We ran this command on a Xiaomi 14 Pro; the measured results are shown in the screenshot below.
+```
+$./minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 -t 6 --image xx.jpg -p "What is in the image?"
+```
+![Xiaomi 14 Pro test result](assets/xiaomi14pro_test.jpeg)
\ No newline at end of file
diff --git a/examples/minicpmv/assets/xiaomi14pro_test.jpeg b/examples/minicpmv/assets/xiaomi14pro_test.jpeg
new file mode 100644
index 000000000..8762c9c7b
Binary files /dev/null and b/examples/minicpmv/assets/xiaomi14pro_test.jpeg differ
diff --git a/examples/minicpmv/clip.cpp b/examples/minicpmv/clip.cpp
new file mode 100644
index 000000000..a0523fd94
--- /dev/null
+++ b/examples/minicpmv/clip.cpp
@@ -0,0 +1,1870 @@
+#include "clip.h"
+#include "common.h"
+#include "log.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <vector>
+#include <sstream>
+#include <cinttypes>
+#include <limits>
+
+//#define CLIP_DEBUG_FUNCTIONS
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
+static std::string format(const char * fmt, ...)
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +// +// key constants +// + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_minicpmv_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" + +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" + + +// +// tensor name constants +// + +#define TN_TOKEN_EMBD "%s.token_embd.weight" +// #define TN_POS_EMBD "%s.position_embd" +// // #define TN_CLASS_EMBD "v.class_embd" +// #define TN_PATCH_EMBD "v.patch_embd.proj.%s" +#define TN_POS_EMBD "%s.position_embd.weight" +// #define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.%s" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +// #define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ "visual_projection.weight" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" +// MINICPMV +// #define TN_MINICPMV_POS_EMBD "resampler.pos_embed" +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_MLP_NORM, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_RESAMPLER, "resampler"}, +}; + + +// +// utilities to get data from a gguf file +// + +static int get_key_idx(const gguf_context * ctx, const char * key) { + int i = gguf_find_key(ctx, key); + if (i == 
-1) { + LOG_TEE("key %s not found in file\n", key); + throw std::runtime_error(format("Missing required key: %s", key)); + } + + return i; +} + +static uint32_t get_u32(const gguf_context * ctx, const std::string & key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_u32(ctx, i); +} + +static float get_f32(const gguf_context * ctx, const std::string & key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_f32(ctx, i); +} + +static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); + if (!cur) { + throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str())); + } + + return cur; +} + +static std::string get_ftype(int ftype) { + return ggml_type_name(static_cast(ftype)); +} + +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); + } +} + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { + size_t tensor_size = ggml_nbytes(tensor); + LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + prefix, ggml_n_dims(tensor), tensor->name, 
tensor_size, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); +} + +static projector_type clip_projector_type_from_string(const std::string & name) { + for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + return PROJECTOR_TYPE_UNKNOWN; +} + +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + 
file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// debug function to convert f32 to u8 +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} +#endif + + +// +// clip layers +// + +struct clip_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + + float eps; + + char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) + + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; +}; + +struct clip_layer { + // attention + struct ggml_tensor * k_w; + struct ggml_tensor * k_b; + struct ggml_tensor * q_w; + struct ggml_tensor * q_b; + struct ggml_tensor * v_w; + struct ggml_tensor * v_b; + + struct ggml_tensor * o_w; + struct ggml_tensor * o_b; + + // layernorm 1 + struct ggml_tensor * ln_1_w; + struct ggml_tensor * ln_1_b; + + // ff + struct ggml_tensor * ff_i_w; + struct ggml_tensor * ff_i_b; + + struct ggml_tensor * ff_o_w; + struct ggml_tensor * ff_o_b; + + // layernorm 2 + struct ggml_tensor * ln_2_w; + struct ggml_tensor * ln_2_b; +}; + +struct clip_vision_model { + struct clip_hparams hparams; + + // embeddings + // struct ggml_tensor * class_embedding; + struct ggml_tensor * patch_embeddings_w; + struct ggml_tensor * patch_embeddings_b; + struct ggml_tensor * position_embeddings; + + // struct ggml_tensor * pre_ln_w; + // struct ggml_tensor * pre_ln_b; + + std::vector layers; + + struct ggml_tensor * post_ln_w; + struct ggml_tensor * post_ln_b; + + struct ggml_tensor * projection; + + // LLaVA projection + struct ggml_tensor * mm_0_w = NULL; + struct ggml_tensor * mm_0_b = NULL; + struct ggml_tensor * mm_2_w = NULL; + struct ggml_tensor * mm_2_b = NULL; + + struct ggml_tensor * image_newline = NULL; + + // Yi type models with mlp+normalization projection + struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 + struct ggml_tensor * mm_1_b = NULL; + struct ggml_tensor * mm_3_w = NULL; + struct ggml_tensor * mm_3_b = NULL; + struct ggml_tensor * mm_4_w = NULL; + struct ggml_tensor * mm_4_b = NULL; + + // MobileVLM projection + struct ggml_tensor * mm_model_mlp_1_w; + struct ggml_tensor * mm_model_mlp_1_b; + struct ggml_tensor * mm_model_mlp_3_w; + struct ggml_tensor * mm_model_mlp_3_b; + struct ggml_tensor * mm_model_block_1_block_0_0_w; + struct ggml_tensor * mm_model_block_1_block_0_1_w; + struct ggml_tensor * mm_model_block_1_block_0_1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc1_w; + struct ggml_tensor * mm_model_block_1_block_1_fc1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc2_w; + struct ggml_tensor * mm_model_block_1_block_1_fc2_b; + struct ggml_tensor * mm_model_block_1_block_2_0_w; + struct ggml_tensor * mm_model_block_1_block_2_1_w; + struct ggml_tensor * mm_model_block_1_block_2_1_b; + struct ggml_tensor * mm_model_block_2_block_0_0_w; + struct ggml_tensor * mm_model_block_2_block_0_1_w; + struct ggml_tensor * mm_model_block_2_block_0_1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc1_w; + struct ggml_tensor * mm_model_block_2_block_1_fc1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc2_w; + struct ggml_tensor * mm_model_block_2_block_1_fc2_b; + struct ggml_tensor * mm_model_block_2_block_2_0_w; + struct 
ggml_tensor * mm_model_block_2_block_2_1_w; + struct ggml_tensor * mm_model_block_2_block_2_1_b; + + // MobileVLM_V2 projection + struct ggml_tensor * mm_model_mlp_0_w; + struct ggml_tensor * mm_model_mlp_0_b; + struct ggml_tensor * mm_model_mlp_2_w; + struct ggml_tensor * mm_model_mlp_2_b; + struct ggml_tensor * mm_model_peg_0_w; + struct ggml_tensor * mm_model_peg_0_b; + + // MINICPMV projection + // struct ggml_tensor * mm_model_pos_embed; + struct ggml_tensor * mm_model_pos_embed_k; + struct ggml_tensor * mm_model_query; + struct ggml_tensor * mm_model_proj; + struct ggml_tensor * mm_model_kv_proj; + struct ggml_tensor * mm_model_attn_q_w; + struct ggml_tensor * mm_model_attn_q_b; + struct ggml_tensor * mm_model_attn_k_w; + struct ggml_tensor * mm_model_attn_k_b; + struct ggml_tensor * mm_model_attn_v_w; + struct ggml_tensor * mm_model_attn_v_b; + struct ggml_tensor * mm_model_attn_o_w; + struct ggml_tensor * mm_model_attn_o_b; + struct ggml_tensor * mm_model_ln_q_w; + struct ggml_tensor * mm_model_ln_q_b; + struct ggml_tensor * mm_model_ln_kv_w; + struct ggml_tensor * mm_model_ln_kv_b; + struct ggml_tensor * mm_model_ln_post_w; + struct ggml_tensor * mm_model_ln_post_b; +}; + +struct clip_ctx { + bool has_text_encoder = false; + bool has_vision_encoder = false; + bool has_minicpmv_projector = false; + + struct clip_vision_model vision_model; + projector_type proj_type = PROJECTOR_TYPE_MLP; + + float image_mean[3]; + float image_std[3]; + bool use_gelu = false; + int32_t ftype = 1; + + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_data; + + std::vector buf_compute_meta; + + // memory buffers to evaluate the model + ggml_backend_buffer_t params_buffer = NULL; + + ggml_backend_t backend = NULL; + ggml_gallocr_t compute_alloc = NULL; +}; + +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair load_image_size = {448, 448}) { + if (!ctx->has_vision_encoder) { + LOG_TEE("This gguf file seems to have no vision encoder\n"); + return nullptr; + } + + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size_width = load_image_size.first; + const int image_size_height = load_image_size.second; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int num_positions = num_patches; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + const int batch_size = imgs->size; + + if (ctx->has_minicpmv_projector) { + GGML_ASSERT(batch_size == 1); + } + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + LOG_TEE("%s: ctx->buf_compute_meta.size(): %d \n", __func__, ctx->buf_compute_meta.size()); + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + LOG_TEE("%s: load_image_size: %d %d\n", __func__, load_image_size.first, load_image_size.second); + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_w, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + inp = ggml_reshape_3d(ctx0, inp, 
num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + inp = ggml_add(ctx0, inp, model.patch_embeddings_b); + + // concat class_embeddings and patch_embeddings + // struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + // ggml_set_name(embeddings, "embeddings"); + // ggml_set_input(embeddings); + + // embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + // embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + + // embeddings = ggml_acc(ctx0, embeddings, inp, + // embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + struct ggml_tensor * embeddings = + ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + int pos_w = image_size_width/patch_size; + int pos_h = image_size_height/patch_size; + struct ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + + // // pre-layernorm + // { + // embeddings = ggml_norm(ctx0, embeddings, eps); + // ggml_set_name(embeddings, "pre_ln"); + + // embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); + // } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + //const size_t nb_q_w = model.layers[il].q_w->nb[0]; + + // layernorm1 + { + cur = ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), + model.layers[il].ln_1_b); + } + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = 
ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + if (ctx->use_gelu) { + cur = ggml_gelu_inplace(ctx0, cur); + } else { + cur = ggml_gelu_quick_inplace(ctx0, cur); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + { // post layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); + } + + // llava projector + { + // embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + // struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 64); + // ggml_set_name(patches, "patches"); + // ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + // embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + + } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + struct ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, 
model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, 
model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + } + struct ggml_tensor *k, *v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + } + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + const int hidden_size = 4096; + const int d_head = 128; + const int n_head = hidden_size/d_head; + const int num_query = 96; + + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, 
v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + + embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + } + { // layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); + } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + } + else { + GGML_ASSERT(false); + } + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + ggml_free(ctx0); + + return gf; +} + +// read and create ggml_context containing the tensors and their data +struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, std::pair load_image_size = {448, 448}) { + struct ggml_context * meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &meta, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname, params); + if (!ctx) { + throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); + } + + if (verbosity >= 1) { + const int n_tensors = gguf_get_n_tensors(ctx); + const int n_kv = gguf_get_n_kv(ctx); + const int ftype = get_u32(ctx, KEY_FTYPE); + const std::string ftype_str = get_ftype(ftype); + const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION); + const std::string description = gguf_get_val_str(ctx, idx_desc); + const int idx_name = gguf_find_key(ctx, KEY_NAME); + if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug + const std::string name = gguf_get_val_str(ctx, idx_name); + LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); + } + LOG_TEE("%s: description: %s\n", __func__, description.c_str()); + LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); + LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_TEE("\n"); + } + const int n_tensors = gguf_get_n_tensors(ctx); + + // kv + const int n_kv = gguf_get_n_kv(ctx); + LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + __func__, n_kv, n_tensors, fname); + { + std::map n_type; + + for (int i = 0; i < n_tensors; i++) { + enum ggml_type type = gguf_get_tensor_type(ctx, i); + + n_type[type]++; + } + + LOG_TEE("%s: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(ctx, i); + const enum gguf_type type = gguf_get_kv_type(ctx, i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i)) + : gguf_type_name(type); + + std::string value = gguf_kv_to_str(ctx, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); + + LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + } + + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + } + } + + // data + size_t model_size = 0; + { + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + const size_t offset = gguf_get_tensor_offset(ctx, i); + enum ggml_type type = gguf_get_tensor_type(ctx, i); + struct ggml_tensor * cur = ggml_get_tensor(meta, name); + size_t tensor_size = ggml_nbytes(cur); + model_size += tensor_size; + if (verbosity >= 3) { + LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); + } + } + } + + clip_ctx * new_clip = new clip_ctx; + + // update projector type + { + int idx = gguf_find_key(ctx, KEY_PROJ_TYPE); + if (idx != -1) { + const std::string proj_type = gguf_get_val_str(ctx, idx); + new_clip->proj_type = clip_projector_type_from_string(proj_type); + } else { + new_clip->proj_type = PROJECTOR_TYPE_MLP; + } + + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { + if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { + new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; + } + } + } + +#ifdef GGML_USE_CUDA + new_clip->backend = ggml_backend_cuda_init(0); + LOG_TEE("%s: CLIP using CUDA backend\n", __func__); +#endif + +#ifdef GGML_USE_METAL + new_clip->backend = ggml_backend_metal_init(); + LOG_TEE("%s: CLIP using Metal backend\n", __func__); +#endif + + + if (!new_clip->backend) { + new_clip->backend = ggml_backend_cpu_init(); + LOG_TEE("%s: CLIP using CPU backend\n", __func__); + } + + // model size and capabilities + { + int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); + new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx); + + idx = get_key_idx(ctx, KEY_HAS_VIS_ENC); + new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx); + + idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ); + if (idx != -1) { + new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); + } + + GGML_ASSERT(new_clip->has_minicpmv_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + GGML_ASSERT(new_clip->has_vision_encoder); + GGML_ASSERT(!new_clip->has_text_encoder); + + idx = get_key_idx(ctx, KEY_USE_GELU); + new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + + if (verbosity >= 1) { + LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_TEE("%s: model size: %.2f 
MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + } + } + + LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + + // load tensors + { + std::vector read_buf; + struct ggml_init_params params = { + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + new_clip->ctx_data = ggml_init(params); + if (!new_clip->ctx_data) { + LOG_TEE("%s: ggml_init() failed\n", __func__); + clip_free(new_clip); + gguf_free(ctx); + return nullptr; + } + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + LOG_TEE("cannot open model file for loading tensors\n"); + clip_free(new_clip); + gguf_free(ctx); + return nullptr; + } + + // add tensors to context + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * t = ggml_get_tensor(meta, name); + struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t); + ggml_set_name(cur, name); + } + + // alloc memory and offload data + new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); + const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); + fin.seekg(offset, std::ios::beg); + if (!fin) { + LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); + clip_free(new_clip); + gguf_free(ctx); + return nullptr; + } + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + fin.close(); + } + + // vision model + if (new_clip->has_vision_encoder) { + // load vision model + auto & vision_model = new_clip->vision_model; + auto & hparams = vision_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); + hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); + hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); + hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + + try { + int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); + int n = gguf_get_arr_n(ctx, idx); + const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); + for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { + hparams.image_grid_pinpoints[i] = pinpoints[i]; + } + if (n < 32) + hparams.image_grid_pinpoints[n] = 0; + } catch (std::runtime_error & e) { + hparams.image_grid_pinpoints[0]=0; + } + + try { + int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); + strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); + } catch (std::runtime_error & e) { + strcpy(hparams.mm_patch_merge_type, "flat"); + } + + try { + hparams.image_crop_resolution = 
get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 + } catch(const std::exception& e) { + hparams.image_crop_resolution = hparams.image_size; + } + + int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); + int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); + const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); + + for (int i = 0; i < 3; ++i) { + new_clip->image_mean[i] = mean_data[i]; + new_clip->image_std[i] = std_data[i]; + } + + if (verbosity >= 2) { + LOG_TEE("\n%s: vision model hparams\n", __func__); + LOG_TEE("image_size %d\n", hparams.image_size); + LOG_TEE("patch_size %d\n", hparams.patch_size); + LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); + LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); + LOG_TEE("v_n_head %d\n", hparams.n_head); + LOG_TEE("v_n_layer %d\n", hparams.n_layer); + LOG_TEE("v_eps %f\n", hparams.eps); + LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_TEE("v_image_grid_pinpoints: "); + for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { + LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); + } + LOG_TEE("\n"); + LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + + } + + try { + vision_model.patch_embeddings_w = get_tensor(new_clip->ctx_data, format(TN_PATCH_EMBD, "weight")); + vision_model.patch_embeddings_b = get_tensor(new_clip->ctx_data, format(TN_PATCH_EMBD, "bias")); + // vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); + // vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + // vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); + vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); + } catch(const std::exception& e) { + LOG_TEE("%s: failed to load vision model tensors\n", __func__); + } + + // LLaVA projection + if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + try { + // Yi-type llava + vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); + } catch (std::runtime_error & e) { } + try { + // missing in Yi-type llava + vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } catch (std::runtime_error & e) { } + try { + // Yi-type llava + vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); + vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); + } catch (std::runtime_error & e) { } + try { + // Yi-type llava + vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); + 
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); + } catch (std::runtime_error & e) { } + try { + vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); + // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); + } catch (std::runtime_error & e) { } + } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projection + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2) + { + // MobilVLM_V2 projection + vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight")); + vision_model.mm_model_mlp_0_b = 
get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias")); + vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight")); + vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias")); + vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); + vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { + // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); + vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); + vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); + vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ); + vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); + vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); + vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); + vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); + vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); + vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); + vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); + vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); + vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); + vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); + vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); + vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); + vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); + vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + } + + vision_model.layers.resize(hparams.n_layer); + + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = vision_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight")); + layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight")); + layer.k_b = 
get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias")); + } + } + + ggml_free(meta); + + new_clip->ctx_gguf = ctx; + + // measure mem requirement and allocate + { + // todo + new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); + new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); + clip_image_f32_batch batch; + batch.size = 1; + ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, load_image_size); + ggml_gallocr_reserve(new_clip->compute_alloc, gf); + size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); + LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + } + + return new_clip; +} + +struct clip_image_u8 * clip_image_u8_init() { + return new clip_image_u8(); +} + +struct clip_image_f32 * clip_image_f32_init() { + return new clip_image_f32(); +} + +void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } +void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; + } +} +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; + } +} + +static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { + img->nx = nx; + img->ny = ny; + img->buf.resize(3 * nx * ny); + memcpy(img->buf.data(), data, img->buf.size()); +} + +bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load(fname, &nx, &ny, &nc, 3); + if (!data) { + LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); + return false; + } + build_clip_img_from_data(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); + if (!data) { + LOG_TEE("%s: failed to decode image bytes\n", __func__); + return false; + } + build_clip_img_from_data(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +static void normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst) { + dst->nx = src->nx; + dst->ny = src->ny; + dst->buf.resize(src->buf.size()); + const auto & m3 = ctx->image_mean; + const auto & s3 = ctx->image_std; + + for (size_t i = 0; i < src->buf.size(); ++i) { + int c = i % 3; // rgb + dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - m3[c]) / s3[c]; + } +} + +ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + +void clip_free(clip_ctx * ctx) { + ggml_free(ctx->ctx_data); + 
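normalize_image_u8_to_f32 above is a plain per-channel affine map over the interleaved RGB buffer. A self-contained sketch of the same computation; the mean/std constants here are the CLIP defaults used by the conversion script in this PR, while the real values are read from the GGUF metadata:

```cpp
#include <cstdint>
#include <vector>

// (pixel/255 - mean[c]) / std[c], applied per channel over an interleaved RGBRGB... buffer.
std::vector<float> normalize_rgb(const std::vector<uint8_t> & rgb) {
    const float mean[3] = {0.48145466f, 0.4578275f, 0.40821073f};
    const float stdd[3] = {0.26862954f, 0.26130258f, 0.27577711f};
    std::vector<float> out(rgb.size());
    for (size_t i = 0; i < rgb.size(); ++i) {
        const int c = i % 3; // channel index: buffer is interleaved RGB
        out[i] = (rgb[i] / 255.0f - mean[c]) / stdd[c];
    }
    return out;
}
```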
gguf_free(ctx->ctx_gguf); + + ggml_backend_buffer_free(ctx->params_buffer); + ggml_backend_free(ctx->backend); + ggml_gallocr_free(ctx->compute_alloc); + delete ctx; +} + +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_image_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_size; +} + +int32_t clip_patch_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.patch_size; +} + +int32_t clip_hidden_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.hidden_size; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.mm_patch_merge_type; +} + +const int32_t * clip_image_grid(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_grid_pinpoints; +} + +int clip_n_patches(const struct clip_ctx * ctx) { + const auto & params = ctx->vision_model.hparams; + + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + + if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { + n_patches /= 4; + } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + n_patches = 96; + } + + return n_patches; +} + +std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector>& pos) { + assert(embed_dim % 2 == 0); + int H = pos.size(); + int W = pos[0].size(); + + std::vector omega(embed_dim / 2); + for (int i = 0; i < embed_dim / 2; ++i) { + omega[i] = 1.0 / pow(10000.0, static_cast(i) / (embed_dim / 2)); + } + + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + float out_value = pos[h][w] * omega[d]; + emb[h][w][d] = sin(out_value); + emb[h][w][d + embed_dim / 2] = cos(out_value); + } + } + } + + return emb; +} + +std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>>& grid) { + assert(embed_dim % 2 == 0); + std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) + std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) + + int H = emb_h.size(); + int W = emb_h[0].size(); + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + emb[h][w][d] = emb_h[h][w][d]; + emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; + } + } + } + return emb; +} + +std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { + int grid_h_size = image_size.first; + int grid_w_size = image_size.second; + + std::vector grid_h(grid_h_size); + std::vector grid_w(grid_w_size); + + for (int i = 0; i < grid_h_size; ++i) { + grid_h[i] = static_cast(i); + } + for (int i = 0; i < grid_w_size; ++i) { + grid_w[i] = static_cast(i); + } + + std::vector> grid(grid_h_size, std::vector(grid_w_size)); + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid[h][w] = grid_w[w]; + } + } + std::vector>> grid_2d = {grid, grid}; + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid_2d[0][h][w] = grid_h[h]; + grid_2d[1][h][w] = grid_w[w]; + } + } + + std::vector>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); + + int H = image_size.first; + int W = image_size.second; + std::vector> 
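For the resampler path the buffer size is easy to check by hand: clip_n_patches above returns 96 query tokens, and clip_n_mmproj_embd (defined later in this file) returns 4096, so clip_embd_nbytes comes out to roughly 1.5 MiB per encoded slice:

```cpp
#include <cstddef>

// One encoded slice on the resampler path: 96 query tokens x 4096 floats x 4 bytes.
constexpr std::size_t n_patches   = 96;
constexpr std::size_t n_embd      = 4096;
constexpr std::size_t embd_nbytes = n_patches * n_embd * sizeof(float);
static_assert(embd_nbytes == 1572864, "96 x 4096 x 4 bytes, about 1.5 MiB");
```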
pos_embed_2d(H * W, std::vector(embed_dim)); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; + } + } + + return pos_embed_2d; +} + +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, std::pair load_image_size = {448, 448}) { + if (!ctx->has_vision_encoder) { + LOG_TEE("This gguf file seems to have no vision encoder\n"); + return false; + } + + clip_image_f32_batch imgs{}; + imgs.size = 1; + imgs.data = img; + return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size); +} + +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, std::pair load_image_size = {448, 448}) { + if (!ctx->has_vision_encoder) { + LOG_TEE("This gguf file seems to have no vision encoder\n"); + return false; + } + + int batch_size = imgs->size; + if (ctx->has_minicpmv_projector) { + GGML_ASSERT(batch_size == 1); // TODO: support multiple images + } + + // build the inference graph + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, load_image_size); + ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); + + // set inputs + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size_width = load_image_size.first; + const int image_size_height = load_image_size.second; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int num_positions = num_patches; + + { + struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); + float * data = (float *)malloc(ggml_nbytes(inp_raw)); + + for (size_t i = 0; i < imgs->size; i++) { + const int nx = imgs->data[i].nx; + const int ny = imgs->data[i].ny; + // GGML_ASSERT(nx == image_size && ny == image_size); + + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; + } + } + } + } + } + ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + free(data); + } + + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = std::floor(70.0*i/num_positions); + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + + { + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); + int pos_w = image_size_width/patch_size; + int pos_h = image_size_height/patch_size; + int embed_dim = 4096; + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); + for(int i=0;ibackend)) { + ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); + } + +#ifdef GGML_USE_METAL + if 
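The three helpers above mirror the NumPy sincos position embedding from the conversion script: half of the channels encode the row index and half the column index, each split into sin/cos pairs over a 1/10000^(i/(D/4)) frequency ladder. A compact sketch of the same computation written into a single flat buffer (row-major h*W + w here, whereas the code above stores the transposed w*H + h order before copying into the pos_embed tensor):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// pos_embed[(h*W + w)*D + d]:  d in [0, D/4)     -> sin(h * omega_d)
//                              d in [D/4, D/2)   -> cos(h * omega_d)
//                              d in [D/2, 3D/4)  -> sin(w * omega_d)
//                              d in [3D/4, D)    -> cos(w * omega_d)
std::vector<float> sincos_pos_embed_2d(int D, int H, int W) {
    const int half = D / 2, quarter = D / 4; // D must be divisible by 4
    std::vector<float> omega(quarter);
    for (int i = 0; i < quarter; ++i) {
        omega[i] = 1.0f / std::pow(10000.0f, (float) i / quarter);
    }
    std::vector<float> out((size_t) H * W * D);
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            float * e = out.data() + ((size_t) h * W + w) * D;
            for (int d = 0; d < quarter; ++d) {
                e[d]                  = std::sin(h * omega[d]);
                e[d + quarter]        = std::cos(h * omega[d]);
                e[d + half]           = std::sin(w * omega[d]);
                e[d + half + quarter] = std::cos(w * omega[d]);
            }
        }
    }
    return out;
}
```

Because the grid size depends on the resized input, this table is recomputed per image rather than stored as a fixed tensor.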
(ggml_backend_is_metal(ctx->backend)) { + ggml_backend_metal_set_n_cb(ctx->backend, n_threads); + } +#endif + + ggml_backend_graph_compute(ctx->backend, gf); + + // the last node is the embedding tensor + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} + +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + ggml_type type = GGML_TYPE_Q4_1; + + assert(itype < GGML_TYPE_COUNT); + type = static_cast(itype); + + auto * ctx_clip = clip_model_load(fname_inp, 2); + + const auto & ctx_src = ctx_clip->ctx_gguf; + const auto & ctx_data = ctx_clip->ctx_data; + + auto * ctx_out = gguf_init_empty(); + gguf_set_kv(ctx_out, ctx_src); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", itype); + + auto fout = std::ofstream(fname_out, std::ios::binary); + + const int n_tensors = gguf_get_n_tensors(ctx_src); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + gguf_add_tensor(ctx_out, cur); + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + for (size_t i = 0; i < meta_size; ++i) { + fout.put(0); + } + + // regexes of tensor names to be quantized + const std::vector k_names = { + ".*weight", + }; + + std::vector work(512); + std::vector conv_buf(512); + size_t total_size_org = 0; + size_t total_size_new = 0; + + for (int i = 0; i < n_tensors; ++i) { + const std::string name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + bool quantize = false; + for (const auto & s : k_names) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // quantize only 2D tensors + quantize &= (ggml_n_dims(cur) == 2); + + if (quantize) { + new_type = type; + if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { + new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type + // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + } + const size_t n_elms = ggml_nelements(cur); + float * f32_data; + + switch (cur->type) { + case GGML_TYPE_F32: + f32_data = (float *)cur->data; + break; + case GGML_TYPE_F16: + if (conv_buf.size() < n_elms) { + conv_buf.resize(n_elms); + } + for (size_t j = 0; j < n_elms; ++j) { + conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); + } + f32_data = (float *)conv_buf.data(); + break; + default: + LOG_TEE("Please use an input file in f32 or f16\n"); + gguf_free(ctx_out); + return false; + } + + if (work.size() < n_elms * 4) { + work.resize(n_elms * 4); + } + new_data = work.data(); + + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); + } + const size_t orig_size = ggml_nbytes(cur); + total_size_org += orig_size; + total_size_new += new_size; + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + fout.write((const char *)new_data, new_size); + size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; + for (size_t j = 0; j < pad; ++j) { + 
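The quantization loop above reduces to three checks per tensor: its name must match one of the regexes, it must be 2-D, and embedding tensors are demoted to Q8_0 when a K-quant target is requested because ggml_get_rows needs a non-K type. A condensed sketch of just that decision, with a simplified enum standing in for ggml_type:

```cpp
#include <regex>
#include <string>
#include <vector>

enum class qtype { keep, q4_1, q8_0 }; // simplified stand-ins for ggml_type values

// Decide how a tensor should be stored, following the rules of the loop above.
qtype choose_qtype(const std::string & name, int n_dims, qtype target, bool target_is_k_quant) {
    static const std::vector<std::string> patterns = { ".*weight" };
    bool match = false;
    for (const auto & p : patterns) {
        if (std::regex_match(name, std::regex(p))) { match = true; break; }
    }
    if (!match || n_dims != 2) return qtype::keep;                    // only 2-D weight tensors are quantized
    if (target_is_k_quant && name.find("embd") != std::string::npos) {
        return qtype::q8_0;                                           // ggml_get_rows needs a non-K type
    }
    return target;                                                    // e.g. q4_1, the default above
}
```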
fout.put(0); + } + + LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + } + + // go back to beginning of file and write the updated metadata + fout.seekp(0, std::ios::beg); + std::vector meta(meta_size); + gguf_get_meta_data(ctx_out, meta.data()); + fout.write((const char *)meta.data(), meta_size); + + fout.close(); + + clip_free(ctx_clip); + gguf_free(ctx_out); + + { + LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + } + + return true; +} + +int clip_n_mmproj_embd(const struct clip_ctx * ctx) { + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + return ctx->vision_model.mm_2_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + return ctx->vision_model.mm_3_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + return 4096; + } + + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); +} diff --git a/examples/minicpmv/clip.h b/examples/minicpmv/clip.h new file mode 100644 index 000000000..aae4c7c3a --- /dev/null +++ b/examples/minicpmv/clip.h @@ -0,0 +1,85 @@ +#ifndef CLIP_H +#define CLIP_H + +#include +#include +#include + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define CLIP_API __declspec(dllexport) +# else +# define CLIP_API __declspec(dllimport) +# endif +# else +# define CLIP_API __attribute__ ((visibility ("default"))) +# endif +#else +# define CLIP_API +#endif + +struct clip_ctx; + +#ifdef __cplusplus +extern "C" { +#endif + +struct clip_ctx; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, std::pair load_image_size); +CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); + +CLIP_API void clip_free(struct clip_ctx * ctx); + +CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); + +CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); + +// TODO: should be enum, not string +CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); + +CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); + +CLIP_API int clip_n_patches (const struct clip_ctx * ctx); +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); + +CLIP_API struct clip_image_u8 * clip_image_u8_init (); +CLIP_API struct clip_image_f32 * clip_image_f32_init(); + +CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); + +CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); + +/** interpret bytes as an 
image file with length bytes_length, and use the result to populate img */ +CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + +static void normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst); + +CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); + +CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair load_image_size); +CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair load_image_size); + +CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); + +#ifdef __cplusplus +} +#endif + +#endif // CLIP_H diff --git a/examples/minicpmv/minicpmv-cli.cpp b/examples/minicpmv/minicpmv-cli.cpp new file mode 100644 index 000000000..281b8d097 --- /dev/null +++ b/examples/minicpmv/minicpmv-cli.cpp @@ -0,0 +1,158 @@ +#include "ggml.h" +#include "log.h" +#include "common.h" +#include "clip.h" +#include "minicpmv.h" +#include "minicpmv_wrapper.h" +#include "llama.h" + +#include +#include +#include + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + +struct minicpmv_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ + auto image_embed_slices = minicpmv_image_embed(params, fname); + if (!image_embed_slices[0][0]) { + std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; + return NULL; + } + + // process the prompt + if (params->prompt.empty() && params->interactive == false) { + LOG_TEE("prompt should be given or interactive mode should be on"); + return NULL; + } + + auto model = llava_init(params); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); + return NULL; + } + const int64_t t_llava_init_start_us = ggml_time_us(); + auto ctx_llava = llava_init_context(params, model); + + const int64_t t_llava_init_end_us = ggml_time_us(); + float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; + LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); + + const int64_t t_process_image_start_us = ggml_time_us(); + process_image(ctx_llava, image_embed_slices, params, n_past); + const int64_t t_process_image_end_us = ggml_time_us(); + float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; + LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); + + llava_image_embed_free_slice(image_embed_slices); + return ctx_llava; +} + +struct llama_sampling_context * llama_init(struct minicpmv_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ + std::string user_prompt = prompt; + if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; + const int max_tgt_len = params->n_predict < 0 ? 
256 : params->n_predict; + + eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); + eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); + // generate the response + + LOG_TEE("\n"); + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); + return ctx_sampling; +} + +const char * llama_loop(struct minicpmv_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ + + const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); + return tmp; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + show_additional_info(argc, argv); + return 1; + } + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("llava", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + + if (params.mmproj.empty() || (params.image.empty())) { + gpt_params_print_usage(argc, argv, params); + show_additional_info(argc, argv); + return 1; + } + + for (auto & image : params.image) { + int n_past = 0; + auto ctx_llava = minicpmv_init(¶ms, image, n_past); + + if (!params.prompt.empty()) { + LOG_TEE("%s\n", params.prompt.c_str()); + LOG_TEE(""); + auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); + const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; + std::string response = ""; + bool have_tmp = false; + for (int i = 0; i < max_tgt_len; i++) { + auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + response += tmp; + if (strcmp(tmp, "") == 0){ + if(!have_tmp)continue; + else break; + } + if (strstr(tmp, "###")) break; // Yi-VL behavior + have_tmp = true; + printf("%s", tmp); + if (strstr(response.c_str(), "")) break; // minicpm-v + + fflush(stdout); + } + llama_sampling_free(ctx_sampling); + }else { + while (true) { + LOG_TEE(""); + std::string prompt; + std::getline(std::cin, prompt); + LOG_TEE(""); + auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true); + const int max_tgt_len = params.n_predict < 0 ? 
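Both generation loops in this file follow the same shape: sample a piece of text, stop on an empty piece or on a "###" marker (Yi-VL behavior), print it, and stop once the accumulated response contains the model's end-of-turn string. A schematic version, where next_piece stands in for llama_loop/sample and STOP_MARKER is a hypothetical placeholder for the real stop string:

```cpp
#include <cstdio>
#include <functional>
#include <string>

// Schematic of the token loop in minicpmv-cli; not the actual code.
std::string generate(const std::function<std::string()> & next_piece, int max_tokens) {
    const std::string STOP_MARKER = "<end-of-turn>"; // hypothetical placeholder for the model's stop string
    std::string response;
    for (int i = 0; i < max_tokens; ++i) {
        const std::string piece = next_piece();
        if (piece.empty()) break;                          // sampler produced nothing
        if (piece.find("###") != std::string::npos) break; // Yi-VL style stop marker
        response += piece;
        std::printf("%s", piece.c_str());
        std::fflush(stdout);
        if (response.find(STOP_MARKER) != std::string::npos) break;
    }
    return response;
}
```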
256 : params.n_predict; + std::string response = ""; + for (int i = 0; i < max_tgt_len; i++) { + auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + response += tmp; + if (strcmp(tmp, "") == 0) break; + if (strstr(tmp, "###")) break; // Yi-VL behavior + printf("%s", tmp);// mistral llava-1.6 + if (strstr(response.c_str(), "")) break; // minicpm-v + fflush(stdout); + } + llama_sampling_free(ctx_sampling); + } + } + printf("\n"); + llama_print_timings(ctx_llava->ctx_llama); + + ctx_llava->model = NULL; + llava_free(ctx_llava); + } + + return 0; +} \ No newline at end of file diff --git a/examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py b/examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py new file mode 100644 index 000000000..71af0b4f1 --- /dev/null +++ b/examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py @@ -0,0 +1,382 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * +import timm +from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig + +TEXT = "clip.text" +VISION = "clip.vision" + + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_minicpmv and name in ["visual_projection.weight"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +# if args.clip_model_is_vision or args.clip_model_is_openclip: +# model = CLIPVisionModel.from_pretrained(dir_model) +# processor = None +# else: +# model = CLIPModel.from_pretrained(dir_model) +# processor = CLIPProcessor.from_pretrained(dir_model) + +default_vision_config = { + "hidden_size": 1152, + "image_size": 980, + "intermediate_size": 4304, + "model_type": "idefics2", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } +vision_config = Idefics2VisionConfig(**default_vision_config) +model = 
Idefics2VisionTransformer(vision_config) + +processor = None +# if model.attn_pool is not None: +# model.attn_pool = torch.nn.Identity() + +# model.blocks = model.blocks[:-1] +model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_minicpmv_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.minicpmv_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_minicpmv_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) +fout.add_file_type(ftype) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_minicpmv_projector: + fout.add_description("vision-only CLIP model") +elif has_minicpmv_projector: + fout.add_description("image encoder for MiniCPM-V") + # add projector type + fout.add_string("clip.projector_type", "resampler") +else: + fout.add_description("two-tower CLIP model") + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", 448) + fout.add_uint32("clip.vision.patch_size", 14) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), 1152) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 4304) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), 16) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + block_count = 26 + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = True +fout.add_bool("clip.use_gelu", use_gelu) + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + +def _replace_name_resampler(s, v): + if re.match("resampler.pos_embed", s): + return { + s: v, + re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + } + if re.match("resampler.proj", s): + return { + re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), + } + if re.match("resampler.attn.in_proj_.*", s): + return { + re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], + re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], + re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], + } + return {s: v} + +if has_minicpmv_projector: + projector = torch.load(args.minicpmv_projector) + new_state_dict = {} + for k, v in projector.items(): + kvs = _replace_name_resampler(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv + projector = new_state_dict + for name, data in projector.items(): + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + if ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + fout.add_tensor(name, data) + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + + print("Projector tensors added\n") + +def _replace_name(s, v): + s = "vision_model." 
+ s + if re.match("vision_model.embeddings.position_embedding", s): + v = v.unsqueeze(0) + return {s: v} + + return {s: v} + +state_dict = model.state_dict() +new_state_dict = {} +for k, v in state_dict.items(): + kvs = _replace_name(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv +state_dict = new_state_dict +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/examples/minicpmv/minicpmv-surgery.py b/examples/minicpmv/minicpmv-surgery.py new file mode 100644 index 000000000..51b4d4859 --- /dev/null +++ b/examples/minicpmv/minicpmv-surgery.py @@ -0,0 +1,48 @@ +import argparse +import glob +import os +import torch +from transformers import AutoModel, AutoTokenizer + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/minicpmv.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] +if len(clip_tensors) > 0: + clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/minicpmv.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +config = model.llm.config +config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5" +config.auto_map = { + "AutoConfig": "configuration_minicpm.MiniCPMConfig", + "AutoModel": "modeling_minicpm.MiniCPMModel", + "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" +} +model.llm.save_pretrained(f"{args.model}/model") +tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) +tok.save_pretrained(f"{args.model}/model") +# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py") + +print("Done!") +print(f"Now you 
can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.") diff --git a/examples/minicpmv/minicpmv.cpp b/examples/minicpmv/minicpmv.cpp new file mode 100644 index 000000000..9dbe577c9 --- /dev/null +++ b/examples/minicpmv/minicpmv.cpp @@ -0,0 +1,450 @@ +#include "clip.h" +#include "common.h" +#include "llama.h" +#include "minicpmv.h" +#include "base64.hpp" + +#include +#include +#include +#include + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + +struct clip_image_grid_shape { + int first; + int second; +}; + +static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { + // std::vector img_res_v; + // format VectN x H x W x RGB (N x 448 x 448 x 3) + clip_image_f32 * img_res_v = clip_image_f32_init(); + std::pair load_image_size; + load_image_size.first = img->nx; + load_image_size.second = img->ny; + normalize_image_u8_to_f32(ctx_clip, img, img_res_v); + + const int64_t t_img_enc_start_us = ggml_time_us(); + + const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); + LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type); + + *n_img_pos = clip_n_patches(ctx_clip); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd, load_image_size); // image_embd shape is 96 x 4096 + if (!encoded) { + LOG_TEE("Unable to encode image\n"); + return false; + } + LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + + const int64_t t_img_enc_end_us = ggml_time_us(); + float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; + LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + + return true; +} + +bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { + // make sure that the correct mmproj was used, i.e., compare apples to apples + int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); + auto n_image_embd = clip_n_mmproj_embd(ctx_clip); + if (n_image_embd != n_llama_embd) { + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). 
Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + return false; + } + return true; +} + +bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); + if (!image_embd) { + LOG_TEE("Unable to allocate memory for image embeddings\n"); + return false; + } + + int n_img_pos; + if (!encode_image_with_clip_uhd(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { + LOG_TEE("%s: cannot encode image, aborting\n", __func__); + free(image_embd); + return false; + } + *image_embd_out = image_embd; + *n_img_pos_out = n_img_pos; + + return true; +} + +bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) { + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + + for (int i = 0; i < image_embed->n_image_pos; i += n_batch) { + int n_eval = image_embed->n_image_pos - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; + if (llama_decode(ctx_llama, batch)) { + LOG_TEE("%s : failed to eval\n", __func__); + return false; + } + *n_past += n_eval; + } + return true; +} + +int ensure_divide(int length, int patch_size) { + return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); +} + +std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.first; + int height = original_size.second; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + int best_width = ensure_divide(width, patch_size); + int best_height = ensure_divide(height, patch_size); + return std::make_pair(best_width, best_height); +} + +inline float clip(float x, float lower, float upper) { + return std::max(lower, std::min(x, upper)); +} + +std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width, height; + std::tie(width, height) = original_size; + int grid_x, grid_y; + std::tie(grid_x, grid_y) = grid; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + int grid_width = refine_width / grid_x; + int grid_height = refine_height / grid_y; + + // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) + auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair + int best_grid_width, best_grid_height; + std::tie(best_grid_width, best_grid_height) = best_grid_size; + + // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) + std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) + return refine_size; +} + +static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = 
target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; +} + +// inspired from LLaVA-UHD: +// -> https://arxiv.org/pdf/2403.11703 +// -> https://github.com/thunlp/LLaVA-UHD +// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 +std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14, const bool never_split=false) { + const std::pair original_size={img->nx,img->ny}; + const int original_width = img->nx; + const int original_height = img->ny; + const float log_ratio = log(1.0*original_width/original_height); // + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + std::vector> images; + LOG_TEE("%s: multiple %d\n", __func__, multiple); + images.push_back(std::vector()); + + if(multiple <= 1){ + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); + clip_image_u8 *source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.resize(best_size, Image.Resampling.BICUBIC) + images[images.size()-1].push_back(source_image); + } + else if(multiple > 1){ + + std::vector candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); + clip_image_u8 *source_image = 
clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + images[images.size()-1].push_back(source_image); + + std::vector> candidate_grids; + + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.emplace_back(m, split_grids_nums / m); + } + ++m; + } + } + + std::pair best_grid{1, 1}; + float min_error = std::numeric_limits::infinity(); + + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + + auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + clip_image_u8 *refine_image = clip_image_u8_init(); + bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); + + LOG_TEE("%s: refine_image_size: %d %d; best_grid: %d %d\n", __func__, refine_image->nx, refine_image->ny, best_grid.first, best_grid.second); + + // split_to_patches + int width = refine_image->nx; + int height = refine_image->ny; + int grid_x = int(width / best_grid.first); + int grid_y = int(height / best_grid.second); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ + images.push_back(std::vector()); + for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ + clip_image_u8 * patch = clip_image_u8_init(); + patch->nx = grid_x; + patch->ny = grid_y; + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = patches_i; y < patches_i + grid_y; ++y) { + for (int x = patches_j; x < patches_j + grid_x; ++x) { + const int i = 3 * (y * refine_image->nx + x); + const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); + patch->buf[j] = refine_image->buf[i]; + patch->buf[j+1] = refine_image->buf[i+1]; + patch->buf[j+2] = refine_image->buf[i+2]; + } + } + images[images.size()-1].push_back(patch); + } + } + } + return images; +} + +std::vector> llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img) { + std::vector> imgs = uhd_slice_image(img); + for (size_t i = 0; i < imgs.size(); ++i){ + for (size_t j = 0; j < imgs[i].size(); ++j) { + LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + } + } + std::vector> results; + + for (size_t i = 0; i < imgs.size(); ++i){ + results.push_back(std::vector()); + for (size_t j = 0; j < imgs[i].size(); ++j) { + float* image_embed = NULL; + int n_image_pos = 0; + bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos); + if (!image_embed_result) { + LOG_TEE("%s: coulnd't embed the image\n", __func__); + return std::vector>(); + } + + auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + result->embed = image_embed; + result->n_image_pos = n_image_pos; + results[i].push_back(result); + } + } + return results; +} + +static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { + auto file = fopen(path, "rb"); + if (file == NULL) { + LOG_TEE("%s: can't read file %s\n", __func__, path); + return false; + } + + fseek(file, 0, SEEK_END); + auto fileSize = ftell(file); + 
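uhd_slice_image chooses its grid in two steps: the slice count is min(ceil(w*h / 448^2), 9), and among the factor pairs of that count (and its neighbors) it keeps the grid whose aspect ratio is closest to the image's in log space. A standalone sketch of just the grid search, assuming the same 448 scale resolution and 9-slice cap:

```cpp
#include <cmath>
#include <cstdio>
#include <initializer_list>
#include <limits>
#include <utility>
#include <vector>

// Reproduce the grid search of uhd_slice_image for a given image size.
std::pair<int, int> pick_grid(int width, int height, int scale_resolution = 448, int max_slices = 9) {
    const float ratio    = 1.0f * width * height / (scale_resolution * scale_resolution);
    const int   multiple = (int) std::fmin(std::ceil(ratio), (float) max_slices);
    if (multiple <= 1) return {1, 1};                       // small image: no slicing needed

    std::vector<std::pair<int, int>> grids;
    for (int n : {multiple - 1, multiple, multiple + 1}) {
        if (n <= 1 || n > max_slices) continue;
        for (int m = 1; m <= n; ++m) {
            if (n % m == 0) grids.emplace_back(m, n / m);   // all factor pairs of n
        }
    }

    const float log_ratio = std::log(1.0f * width / height);
    std::pair<int, int> best{1, 1};
    float min_err = std::numeric_limits<float>::infinity();
    for (const auto & g : grids) {
        const float err = std::fabs(log_ratio - std::log(1.0f * g.first / g.second));
        if (err < min_err) { min_err = err; best = g; }
    }
    return best; // {columns, rows}
}

int main() {
    auto g = pick_grid(1344, 896); // ratio ~6, so grids with 5, 6 or 7 slices are considered
    std::printf("best grid: %d x %d\n", g.first, g.second); // 3 x 2 for this landscape image
}
```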
fseek(file, 0, SEEK_SET); + + auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data + if (buffer == NULL) { + LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + perror("Memory allocation error"); + fclose(file); + return false; + } + errno = 0; + size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer + if (ferror(file)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != (size_t) fileSize) { + die("unexpectedly reached end of file"); + } + fclose(file); // Close the file + + *bytesOut = buffer; + *sizeOut = fileSize; + return true; +} + +bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { + auto image_embed_slices = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); + if (!image_embed_slices[0][0]){ + LOG_TEE("%s: failed to embed image\n", __func__); + return false; + } + std::string fname = "./examples/minicpm-v2.5/slice_token_for_ollama.raw"; + unsigned char* slice_token; + long image_bytes_length; + auto loaded = load_file_to_bytes(fname.c_str(), &slice_token, &image_bytes_length); + if (!loaded) { + LOG_TEE("%s: failed to load %s\n", __func__, fname.c_str()); + return false; + } + + float * all_image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*61); + int all_n_img_pos=0; + int token_len = clip_n_mmproj_embd(ctx_clip)*sizeof(float); + + std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len); + std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[0][0]->embed, 96*token_len); + all_n_img_pos+=clip_n_patches(ctx_clip); + std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len); + if (image_embed_slices.size() > 1) { + std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*2, token_len); + for (size_t i = 1; i < image_embed_slices.size(); ++i) { + for (size_t j = 0; j < image_embed_slices[i].size(); ++j) { + std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len); + std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[i][j]->embed, 96*token_len); + all_n_img_pos+=clip_n_patches(ctx_clip); + std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len); + if (j == image_embed_slices[i].size() - 1) { + std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*4, token_len); + } + } + } + std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*3, token_len); + } + *image_embd_out = all_image_embd; + *n_img_pos_out = all_n_img_pos; + return true; +} + +std::vector<std::vector<struct llava_image_embed *>> llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { + unsigned char* image_bytes; + long image_bytes_length; + auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); + if (!loaded) { + LOG_TEE("%s: failed to load %s\n", __func__, image_path); + return std::vector<std::vector<struct llava_image_embed *>>(); + } + clip_image_u8 * img = clip_image_u8_init(); + if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { + clip_image_u8_free(img); + LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); + return std::vector<std::vector<struct llava_image_embed *>>(); + } + + std::vector<std::vector<struct llava_image_embed *>> embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); + + clip_image_u8_free(img); + free(image_bytes); + return embeds; +} + +void 
llava_image_embed_free_uhd(std::vector<std::vector<struct llava_image_embed *>> embed) { + for (size_t i = 0; i < embed.size(); ++i){ + for (size_t j = 0; j < embed[i].size(); ++j){ + free(embed[i][j]->embed); + free(embed[i][j]); + } + embed[i] = std::vector<struct llava_image_embed *>(); + } + embed = std::vector<std::vector<struct llava_image_embed *>>(); +} \ No newline at end of file diff --git a/examples/minicpmv/minicpmv.h b/examples/minicpmv/minicpmv.h new file mode 100644 index 000000000..dd360920a --- /dev/null +++ b/examples/minicpmv/minicpmv.h @@ -0,0 +1,51 @@ +#ifndef LLAVA_H +#define LLAVA_H + +#include "ggml.h" + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define MINICPMV_API __declspec(dllexport) +# else +# define MINICPMV_API __declspec(dllimport) +# endif +# else +# define MINICPMV_API __attribute__ ((visibility ("default"))) +# endif +#else +# define MINICPMV_API +#endif + +struct clip_ctx; + +#ifdef __cplusplus +extern "C" { +#endif + +struct llava_image_embed { + float * embed; + int n_image_pos; +}; + +/** sanity check for clip <-> llava embed size match */ +MINICPMV_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip); + +MINICPMV_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); + +/** build an image embed from image file bytes */ +MINICPMV_API std::vector<std::vector<struct llava_image_embed *>> llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); +/** build an image embed from a path to an image filename */ +MINICPMV_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); +MINICPMV_API std::vector<std::vector<struct llava_image_embed *>> llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); +MINICPMV_API void llava_image_embed_free_uhd(std::vector<std::vector<struct llava_image_embed *>> embed); +/** free an embedding made with llava_image_embed_make_* */ + +/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ +MINICPMV_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/examples/minicpmv/minicpmv_wrapper.cpp b/examples/minicpmv/minicpmv_wrapper.cpp new file mode 100644 index 000000000..5e3a572a8 --- /dev/null +++ b/examples/minicpmv/minicpmv_wrapper.cpp @@ -0,0 +1,147 @@ +#include "ggml.h" +#include "common.h" +#include "clip.h" +#include "minicpmv.h" +#include "minicpmv_wrapper.h" +#include "llama.h" +#include <cstdio> +#include <cstdlib> +#include <vector> + +struct llama_model * llava_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_TEE("%s: error: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +struct minicpmv_context * llava_init_context(gpt_params * params, llama_model * model) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto ctx_llava = (struct minicpmv_context *)malloc(sizeof(minicpmv_context)); + + ctx_llava->ctx_llama = ctx_llama; + ctx_llava->model = model; + return ctx_llava; +} + +void llava_free(struct minicpmv_context * ctx_llava) { + llama_free(ctx_llava->ctx_llama); + llama_free_model(ctx_llava->model); + llama_backend_free(); +} + +struct clip_ctx * clip_init_context(gpt_params * params) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + std::pair<int, int> load_image_size = std::make_pair(448, 448); + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size); + return ctx_clip; +} + +std::vector<std::vector<struct llava_image_embed *>> minicpmv_image_embed(gpt_params * params, const std::string & fname){ + auto ctx_clip = clip_init_context(params); + auto image_embed_and_slices = llava_image_embed_make_with_filename_slice(ctx_clip, params->n_threads, fname.c_str()); + if (ctx_clip) { + clip_free(ctx_clip); + ctx_clip = NULL; + } + return image_embed_and_slices; +} + + +bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) { + int N = (int) tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + } + return true; +} + +bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector<llama_token> tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 = str; + std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + eval_tokens(ctx_llama, embd_inp, n_batch, n_past); + return true; +} + +void process_image(struct minicpmv_context * ctx_llava, std::vector<std::vector<struct llava_image_embed *>> image_embed_slices, gpt_params * params, int &n_past) { + std::string system_prompt; + + system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; + LOG_TEE("%s: image token past: %d\n", __func__, n_past); + eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, true); + llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices[0][0], params->n_batch, &n_past); + eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false); + if (image_embed_slices.size() > 1) { + eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false); + for (size_t i = 1; i < image_embed_slices.size(); ++i) { + for (size_t j = 0; j < image_embed_slices[i].size(); ++j) { + eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false); + llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices[i][j], params->n_batch, &n_past); + eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false); + if (j == image_embed_slices[i].size() - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } + } + } + eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false); + + } + LOG_TEE("%s: image token past: %d\n", __func__, n_past); +} + +const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} \ No newline at end of file diff --git a/examples/minicpmv/minicpmv_wrapper.h b/examples/minicpmv/minicpmv_wrapper.h new file mode 100644 index 000000000..b3631ed16 --- /dev/null +++ b/examples/minicpmv/minicpmv_wrapper.h @@ -0,0 +1,49 @@ +#ifndef MINICPMV_H +#define MINICPMV_H + +#include "common.h" +#include "clip.h" +#include "minicpmv.h" +#include "llama.h" + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define MINICPMV_API __declspec(dllexport) +# else +# define MINICPMV_API __declspec(dllimport) +# endif +# else +# define MINICPMV_API __attribute__ ((visibility ("default"))) +# endif +#else +# define MINICPMV_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +struct minicpmv_context { + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +MINICPMV_API struct llama_model * llava_init(gpt_params * params); +MINICPMV_API struct minicpmv_context * llava_init_context(gpt_params * params, llama_model * model); 
+MINICPMV_API void llava_free(struct minicpmv_context * ctx_llava); + +MINICPMV_API struct clip_ctx * clip_init_context(gpt_params * params); +MINICPMV_API std::vector<std::vector<struct llava_image_embed *>> minicpmv_image_embed(gpt_params * params, const std::string & fname); + +MINICPMV_API bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past); +MINICPMV_API bool eval_id(struct llama_context * ctx_llama, int id, int * n_past); +MINICPMV_API bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos); +MINICPMV_API void process_image(struct minicpmv_context * ctx_llava, std::vector<std::vector<struct llava_image_embed *>> image_embed_slices, gpt_params * params, int &n_past); +MINICPMV_API const char * sample(struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_llama, int * n_past); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/examples/minicpmv/requirements.txt b/examples/minicpmv/requirements.txt new file mode 100644 index 000000000..f80f727a7 --- /dev/null +++ b/examples/minicpmv/requirements.txt @@ -0,0 +1,3 @@ +-r ../../requirements/requirements-convert.txt +pillow~=10.2.0 +torch~=2.1.1 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 55ec2cb5c..a3c024c89 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -645,6 +645,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { ], MODEL_ARCH.MINICPM: [ MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, diff --git a/llama.cpp b/llama.cpp index 10c9e47dd..3112db7f5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1768,24 +1768,17 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, - MODEL_14M, MODEL_17M, MODEL_22M, MODEL_33M, - MODEL_70M, MODEL_109M, MODEL_137M, - MODEL_160M, MODEL_335M, - MODEL_410M, MODEL_0_5B, MODEL_1B, - MODEL_1_4B, MODEL_2B, - MODEL_2_8B, MODEL_3B, MODEL_4B, - MODEL_6_9B, MODEL_7B, MODEL_8B, MODEL_12B, @@ -1820,7 +1813,6 @@ static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; bool rope_finetuned; - bool use_par_res; uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on @@ -2585,6 +2577,7 @@ static bool llama_kv_cache_init( static bool llama_kv_cache_find_slot( struct llama_kv_cache & cache, const struct llama_batch & batch) { + const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; if (cache.recurrent) { @@ -2635,16 +2628,16 @@ static bool llama_kv_cache_find_slot( } // otherwise, one cell per token. 
- if (n_tokens > cache.size) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); + if (n_tokens > n_ctx) { + LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); return false; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > cache.size) { - n_tested += cache.size - cache.head; + if (cache.head + n_tokens > n_ctx) { + n_tested += n_ctx - cache.head; cache.head = 0; continue; } @@ -2663,7 +2656,7 @@ static bool llama_kv_cache_find_slot( break; } - if (n_tested >= cache.size) { + if (n_tested >= n_ctx) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return false; } @@ -5173,19 +5166,28 @@ static bool llm_load_tensors( case LLM_ARCH_MINICPM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); - // output - { - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - if (model.arch != LLM_ARCH_MINICPM){ - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); - } - } + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); } + // // output + // { + // model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + // if (model.arch != LLM_ARCH_MINICPM){ + // model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + // // if output is NULL, init from the input tok embed + // if (model.output == NULL) { + // model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + // } + // } + // } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); ggml_context * ctx_split = ctx_for_layer_split(i); @@ -10144,7 +10146,9 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // scale the input embeddings - inpL = ggml_scale(ctx0, inpL, scale_embd); + if (batch.token) { + inpL = ggml_scale(ctx0, inpL, scale_embd); + } cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions @@ -10260,7 +10264,8 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + // cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -16216,16 +16221,33 @@ struct llama_model * llama_load_model_from_file( } model->rpc_servers.push_back(servers); } - int status = llama_model_load(path_model, *model, params); - GGML_ASSERT(status <= 0); - if (status < 0) { - if (status == -1) { - LLAMA_LOG_ERROR("%s: 
failed to load model\n", __func__); - } else if (status == -2) { - LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + // int status = llama_model_load(path_model, *model, params); + // GGML_ASSERT(status <= 0); + // if (status < 0) { + // if (status == -1) { + // LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + // } else if (status == -2) { + // LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + // } + // delete model; + // return nullptr; + // } + try { + int status = llama_model_load(path_model, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } + delete model; + return nullptr; } + } catch (...) { + LLAMA_LOG_ERROR("%s: exception loading model\n", __func__); delete model; - return nullptr; + throw; } return model; @@ -16612,6 +16634,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // these models do not use RoPE case LLM_ARCH_GPT2: case LLM_ARCH_GPTJ: + case LLM_ARCH_GPTNEOX: case LLM_ARCH_MPT: case LLM_ARCH_REFACT: case LLM_ARCH_BLOOM: @@ -16649,7 +16672,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: case LLM_ARCH_STARCODER2: - case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here
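For orientation, here is a hedged sketch of how the wrapper API introduced above is meant to be driven. It is NOT part of this diff: the actual `examples/minicpmv/minicpmv-cli.cpp` is not shown in these hunks, so the helper name `caption_image`, the Llama-3 assistant-header suffix, and the exact flow below are assumptions; only `llava_init`, `llava_init_context`, `minicpmv_image_embed`, `process_image`, `eval_string`, `sample`, `llava_image_embed_free_uhd`, and `llava_free` come from the declarations in `minicpmv_wrapper.h`.

```cpp
// Hedged usage sketch, assuming params.model, params.mmproj and params.prompt
// are already filled in (e.g. via gpt_params_parse()).
#include "common.h"
#include "minicpmv_wrapper.h"
#include <cstdio>
#include <cstring>
#include <string>

static void caption_image(gpt_params & params, const std::string & image_path) {
    // load the language model and wrap it in a context (the wrapper bumps n_ctx to >= 2048)
    llama_model      * model     = llava_init(&params);
    minicpmv_context * ctx_llava = llava_init_context(&params, model);

    // slice the image and run every slice through the CLIP projector
    auto image_embed_slices = minicpmv_image_embed(&params, image_path);

    // write the chat prefix and the <image>/<slice>-wrapped embeddings into the context,
    // then append the user prompt and the assistant header (Llama-3 template, assumed)
    int n_past = 0;
    process_image(ctx_llava, image_embed_slices, &params, n_past);
    eval_string(ctx_llava->ctx_llama,
                (params.prompt + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n").c_str(),
                params.n_batch, &n_past, false);

    // sample until the model emits an end-of-generation token (sample() then returns "")
    llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
    for (int i = 0; i < params.n_predict; i++) {
        const char * piece = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
        if (strcmp(piece, "") == 0) break;
        printf("%s", piece);
        fflush(stdout);
    }

    llama_sampling_free(ctx_sampling);
    llava_image_embed_free_uhd(image_embed_slices);
    llava_free(ctx_llava); // frees the llama context, the model and the backend
}
```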