mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-23 18:09:18 +01:00
llava-cli : multiple images (#6969)
Co-authored-by: root <root@nenya.lothlorien.ca>
This commit is contained in:
parent
24affa7db3
commit
ffe666572f
@ -893,7 +893,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
params.image = argv[i];
|
params.image.emplace_back(argv[i]);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (arg == "-i" || arg == "--interactive") {
|
if (arg == "-i" || arg == "--interactive") {
|
||||||
@ -1495,7 +1495,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||||||
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
|
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
|
||||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
|
||||||
if (llama_supports_mlock()) {
|
if (llama_supports_mlock()) {
|
||||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||||
}
|
}
|
||||||
|
@ -168,7 +168,7 @@ struct gpt_params {
|
|||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see examples/llava)
|
||||||
std::string mmproj = ""; // path to multimodal projector
|
std::string mmproj = ""; // path to multimodal projector
|
||||||
std::string image = ""; // path to an image file
|
std::vector<std::string> image; // path to image file(s)
|
||||||
};
|
};
|
||||||
|
|
||||||
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||||
|
@ -113,11 +113,11 @@ struct llava_context {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||||
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||||
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
|
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
|
||||||
|
|
||||||
// load and preprocess the image
|
// load and preprocess the image
|
||||||
llava_image_embed * embed = NULL;
|
llava_image_embed * embed = NULL;
|
||||||
@ -133,9 +133,9 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
|||||||
}
|
}
|
||||||
params->prompt = remove_image_from_prompt(prompt);
|
params->prompt = remove_image_from_prompt(prompt);
|
||||||
} else {
|
} else {
|
||||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
|
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str());
|
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -207,17 +207,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct llama_model * llava_init(gpt_params * params) {
|
||||||
static struct llava_context * llava_init(gpt_params * params) {
|
|
||||||
const char * clip_path = params->mmproj.c_str();
|
|
||||||
|
|
||||||
auto prompt = params->prompt;
|
|
||||||
if (prompt.empty()) {
|
|
||||||
prompt = "describe the image in detail.";
|
|
||||||
}
|
|
||||||
|
|
||||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params->numa);
|
llama_numa_init(params->numa);
|
||||||
|
|
||||||
@ -228,6 +218,19 @@ static struct llava_context * llava_init(gpt_params * params) {
|
|||||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
return model;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
|
||||||
|
const char * clip_path = params->mmproj.c_str();
|
||||||
|
|
||||||
|
auto prompt = params->prompt;
|
||||||
|
if (prompt.empty()) {
|
||||||
|
prompt = "describe the image in detail.";
|
||||||
|
}
|
||||||
|
|
||||||
|
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||||
|
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
|
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
|
||||||
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
||||||
@ -286,15 +289,18 @@ int main(int argc, char ** argv) {
|
|||||||
show_additional_info(argc, argv);
|
show_additional_info(argc, argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
auto model = llava_init(¶ms);
|
||||||
auto ctx_llava = llava_init(¶ms);
|
if (model == NULL) {
|
||||||
if (ctx_llava == NULL) {
|
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||||
LOG_TEE("%s: error: failed to init llava\n", __func__);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto image_embed = load_image(ctx_llava, ¶ms);
|
for (auto & image : params.image) {
|
||||||
|
auto ctx_llava = llava_init_context(¶ms, model);
|
||||||
|
|
||||||
|
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
||||||
if (!image_embed) {
|
if (!image_embed) {
|
||||||
|
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -302,8 +308,11 @@ int main(int argc, char ** argv) {
|
|||||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||||
|
|
||||||
llama_print_timings(ctx_llava->ctx_llama);
|
llama_print_timings(ctx_llava->ctx_llama);
|
||||||
|
|
||||||
llava_image_embed_free(image_embed);
|
llava_image_embed_free(image_embed);
|
||||||
|
ctx_llava->model = NULL;
|
||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
|
}
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user