diff --git a/Makefile b/Makefile index f9cd986c7..1b12431b6 100644 --- a/Makefile +++ b/Makefile @@ -950,12 +950,11 @@ llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/lla $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) -llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv-wrapper.h examples/llava/minicpmv-wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) -c examples/llava/minicpmv-wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv-wrapper.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv-wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv-wrapper.cpp) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) 
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 2978225c5..bbf5fec58 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -43,8 +43,3 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) - -add_library(minicpmv-wrapper OBJECT - minicpmv-wrapper.cpp -) -target_link_libraries(minicpmv-wrapper PRIVATE llava ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 2e7390218..483a0fbb0 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -3,7 +3,6 @@ #include "common.h" #include "clip.h" #include "llava.h" -#include "minicpmv-wrapper.h" #include "llama.h" #include @@ -14,6 +13,12 @@ struct uhd_image_embed { std::vector> image_embeds; }; +struct llava_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + static void show_additional_info(int /*argc*/, char ** argv) { LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); @@ -25,7 +30,147 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } -static struct minicpmv_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ +struct llama_model * llava_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == 
NULL) { + LOG_TEE("%s: error: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + if (params->n_ctx < 2048) { + // warn user here, "Image processing requires at least 2048 context, setting context to 2048" + LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); + ctx_params.n_ctx = 2048; + } else { + ctx_params.n_ctx = params->n_ctx; + } + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto ctx_llava = (struct llava_context *)calloc(1, sizeof(llava_context)); // zero-fill: raw allocation skips the struct's NULL member defaults, so ctx_clip must not be left indeterminate + + ctx_llava->ctx_llama = ctx_llama; + ctx_llava->model = model; + return ctx_llava; +} + +void llava_free(struct llava_context * ctx_llava) { + llama_free(ctx_llava->ctx_llama); + llama_free_model(ctx_llava->model); + llama_backend_free(); +} + +struct clip_ctx * clip_init_context(gpt_params * params) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + return ctx_clip; +} + +struct uhd_image_embed * minicpmv_image_embed(gpt_params * params, const std::string & fname){ + auto ctx_clip = clip_init_context(params); + auto image_embed_and_slices = llava_image_embed_make_with_filename_uhd(ctx_clip, params->n_threads, fname.c_str()); + if (ctx_clip) { + clip_free(ctx_clip); + ctx_clip = NULL; + } + return image_embed_and_slices; +} + + +bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { + int N 
= (int) tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + } + return true; +} + +bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); +} + +void process_image(struct llava_context * ctx_llava, struct uhd_image_embed * image_embed_slices, gpt_params * params, int &n_past) { + std::string system_prompt; + + system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; + LOG_TEE("%s: image token past: %d\n", __func__, n_past); + eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); + llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices->image_embeds[0][0], params->n_batch, &n_past); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (image_embed_slices->image_embeds.size() > 1) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + for (size_t i = 1; i < image_embed_slices->image_embeds.size(); ++i) { + for (size_t j = 0; j < image_embed_slices->image_embeds[i].size(); ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices->image_embeds[i][j], params->n_batch, &n_past); + 
eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == image_embed_slices->image_embeds[i].size() - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } + } + } + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + + } + LOG_TEE("%s: image token past: %d\n", __func__, n_past); +} + +const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} + +static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ auto embeds = minicpmv_image_embed(params, fname); auto image_embed_slices = embeds->image_embeds; if (!image_embed_slices[0][0]) { @@ -61,7 +206,7 @@ static struct minicpmv_context * minicpmv_init(gpt_params * params, const std::s return ctx_llava; } -static struct llama_sampling_context * llama_init(struct minicpmv_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ +static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ std::string user_prompt = prompt; if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; diff --git a/examples/llava/minicpmv-wrapper.cpp b/examples/llava/minicpmv-wrapper.cpp deleted file mode 100644 index 79df92c62..000000000 --- a/examples/llava/minicpmv-wrapper.cpp +++ /dev/null @@ -1,153 +0,0 @@ -#include "ggml.h" -#include "common.h" -#include 
"clip.h" -#include "llava.h" -#include "minicpmv-wrapper.h" -#include "llama.h" -#include -#include -#include - -struct uhd_image_embed { - std::vector> image_embeds; -}; - -struct llama_model * llava_init(gpt_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = llama_model_params_from_gpt_params(*params); - - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); - if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -struct minicpmv_context * llava_init_context(gpt_params * params, llama_model * model) { - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); - if (params->n_ctx < 2048) { - // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); - ctx_params.n_ctx = 2048; - } else { - ctx_params.n_ctx = params->n_ctx; - } - - llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto ctx_llava = (struct minicpmv_context *)malloc(sizeof(minicpmv_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->model = model; - return ctx_llava; -} - -void llava_free(struct minicpmv_context * ctx_llava) { - llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); - llama_backend_free(); -} - -struct clip_ctx * clip_init_context(gpt_params * params) { - const char * clip_path = params->mmproj.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - return 
ctx_clip; -} - -struct uhd_image_embed * minicpmv_image_embed(gpt_params * params, const std::string & fname){ - auto ctx_clip = clip_init_context(params); - auto image_embed_and_slices = llava_image_embed_make_with_filename_uhd(ctx_clip, params->n_threads, fname.c_str()); - if (ctx_clip) { - clip_free(ctx_clip); - ctx_clip = NULL; - } - return image_embed_and_slices; -} - - -bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { - LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); - return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); -} - -void process_image(struct minicpmv_context * ctx_llava, struct uhd_image_embed * image_embed_slices, gpt_params * params, int &n_past) { - std::string system_prompt; - - system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; - LOG_TEE("%s: image token past: %d\n", __func__, n_past); - eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); - llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices->image_embeds[0][0], params->n_batch, &n_past); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if 
(image_embed_slices->image_embeds.size() > 1) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 1; i < image_embed_slices->image_embeds.size(); ++i) { - for (size_t j = 0; j < image_embed_slices->image_embeds[i].size(); ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices->image_embeds[i][j], params->n_batch, &n_past); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == image_embed_slices->image_embeds[i].size() - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); - } - } - } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - - } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); -} - -const char * sample(struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); - llama_sampling_accept(ctx_sampling, ctx_llama, id, true); - static std::string ret; - if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { - ret = ""; - } else { - ret = llama_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} \ No newline at end of file diff --git a/examples/llava/minicpmv-wrapper.h b/examples/llava/minicpmv-wrapper.h deleted file mode 100644 index c65dacf80..000000000 --- a/examples/llava/minicpmv-wrapper.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef MINICPMV_H -#define MINICPMV_H - -#include "common.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" - -#ifdef LLAMA_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef LLAMA_BUILD -# define MINICPMV_API __declspec(dllexport) -# else -# define MINICPMV_API __declspec(dllimport) -# endif -# else -# define MINICPMV_API 
__attribute__ ((visibility ("default"))) -# endif -#else -# define MINICPMV_API -#endif - -bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past); -bool eval_id(struct llama_context * ctx_llama, int id, int * n_past); -bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos); -void process_image(struct minicpmv_context * ctx_llava, struct uhd_image_embed * image_embed_slices, gpt_params * params, int &n_past); -const char * sample(struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_llama, int * n_past); - -#ifdef __cplusplus -extern "C" { -#endif - -struct minicpmv_context { - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -MINICPMV_API struct llama_model * llava_init(gpt_params * params); -MINICPMV_API struct minicpmv_context * llava_init_context(gpt_params * params, llama_model * model); -MINICPMV_API void llava_free(struct minicpmv_context * ctx_llava); - -MINICPMV_API struct clip_ctx * clip_init_context(gpt_params * params); -MINICPMV_API struct uhd_image_embed * minicpmv_image_embed(gpt_params * params, const std::string & fname); - -#ifdef __cplusplus -} -#endif - -#endif \ No newline at end of file