From 9716c7bff7e4208ed116acd721abcbf8c71d0071 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Jan 2025 14:40:35 +0100
Subject: [PATCH] temporary refactor llama_vision_graph_builder

---
 examples/vision/vision.cpp |   8 +-
 src/llama-model.cpp        |   5 +
 src/llama-vision.cpp       | 279 +++++++++++++++++++++----------------
 3 files changed, 173 insertions(+), 119 deletions(-)

diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp
index 88b5be5bb..d8f8d0f11 100644
--- a/examples/vision/vision.cpp
+++ b/examples/vision/vision.cpp
@@ -50,7 +50,7 @@ static llama_vision_bitmap * load_image_from_file(const char * fname) {
 }
 
 // split string by a `std::string delim` instead of `char delim`
-static std::vector<std::string> string_split(std::string s, const std::string & delimiter) {
+static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
     std::vector<std::string> tokens;
     size_t pos = 0;
     std::string token;
@@ -76,7 +76,7 @@ static std::vector<llama_token> tokenize_with_img_placement(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    std::vector<std::string> parts = string_split(text, IMG_PLACEMENT);
+    std::vector<std::string> parts = string_split_str(text, IMG_PLACEMENT);
     std::vector<llama_token> output;
     for (const auto & part : parts) {
         //printf("tokenizing part: %s\n", part.c_str());
@@ -114,6 +114,10 @@ int main(int argc, char ** argv) {
     llama_context * ctx = llama_init.context.get();
     const llama_model * model = llama_init.model.get();
     const llama_vocab * vocab = llama_model_get_vocab(model);
+    if (!model) {
+        LOG_ERR("failed to load model\n");
+        return 1;
+    }
 
     struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9d30f636e..d8ff9c0e5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4056,6 +4056,11 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN2VL:
             return LLAMA_ROPE_TYPE_MROPE;
 
+        case LLM_ARCH_VISION_LLAVA:
+        case LLM_ARCH_VISION_MOBILEVLM:
+        case LLM_ARCH_VISION_MINICPMV:
+            GGML_ABORT("vision arch does not use RoPE");
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp
index c583593b7..3cae05d6b 100644
--- a/src/llama-vision.cpp
+++ b/src/llama-vision.cpp
@@ -19,6 +19,8 @@ struct img_size;
 static int bmp_export(const struct llama_image_u8 &img, const std::string &location);
 #endif
 
+#define VISION_GRAPH_MAX_NODE 1024
+
 struct img_size {
     int width;
     int height;
@@ -403,7 +405,7 @@ struct llama_vision_processor_llava : llama_vision_processor {
 
         output_slices.buf[0] = std::move(res);
         return output_slices;
-    };
+    }
 };
 
 struct llama_vision_processor_uhd : llama_vision_processor {
@@ -572,33 +574,56 @@ struct llama_vision_processor_uhd : llama_vision_processor {
     }
 };
 
-static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int batch_size, img_size & image_size) {
-    auto & model = *ctx.model;
-    auto & hparams = ctx.model->hparams;
+// TODO: move this to llm_build_context in llama.cpp
+struct llama_vision_graph_builder {
+    llama_vision_context & ctx;
+    const llama_vision_model & model;
+    struct ggml_context * ctx0;
+    int batch_size;
+    int hidden_size;
+    int n_head;
+    int d_head;
+    int patch_size;
+    float eps;
+    int num_patches;
+    int num_positions;
+    int img_w;
+    int img_h;
+    bool use_gelu;
+    int n_layers;
+    vision_projector_type proj_type;
 
-    const int hidden_size = hparams.hidden_size;
-    const int n_head = hparams.n_head;
-    const int d_head = hidden_size / n_head;
-    const int patch_size = hparams.patch_size;
-    const float eps = hparams.eps;
-    const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size));
-    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
+    llama_vision_graph_builder(llama_vision_context & ctx, const llama_vision_tokens & inp) : ctx(ctx), model(*ctx.model) {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ ctx.buf_compute_meta.size(),
+            /*.mem_buffer =*/ ctx.buf_compute_meta.data(),
+            /*.no_alloc   =*/ true,
+        };
+        ctx0 = ggml_init(params);
 
-    LLAMA_LOG_DEBUG("%s: num_patches = %d\n", __func__, num_patches);
+        auto & hparams = ctx.model->hparams;
 
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx.buf_compute_meta.size(),
-        /*.mem_buffer =*/ ctx.buf_compute_meta.data(),
-        /*.no_alloc   =*/ true,
-    };
+        batch_size    = inp.buf.size();
+        hidden_size   = hparams.hidden_size;
+        n_head        = hparams.n_head;
+        d_head        = hidden_size / n_head;
+        patch_size    = hparams.patch_size;
+        eps           = hparams.eps;
+        num_patches   = inp.n_px * inp.n_py;
+        num_positions = num_patches + (model.class_embedding ? 1 : 0);
+        img_w         = inp.px * inp.n_px;
+        img_h         = inp.py * inp.n_py;
+        use_gelu      = hparams.use_gelu;
+        n_layers      = (int)hparams.n_layer + hparams.select_layer;
+        proj_type     = hparams.proj_type;
+    }
 
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    ~llama_vision_graph_builder() {
+        ggml_free(ctx0);
+    }
 
-    // input
-    struct ggml_tensor * embeddings;
-    {
-        struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size.width, image_size.height, 3, batch_size);
+    struct ggml_tensor * build_inp() {
+        struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img_w, img_h, 3, batch_size);
         ggml_set_name(inp_raw, "inp_raw");
         ggml_set_input(inp_raw);
 
@@ -612,37 +637,51 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba
         }
         // auto * ne = inp->ne; printf("%d %d %d %d\n", ne[0], ne[1], ne[2], ne[3]);
 
-        embeddings = inp;
+        struct ggml_tensor * embd = inp;
 
         if (model.class_embedding) {
-            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-            ggml_set_name(embeddings, "embeddings");
-            ggml_set_input(embeddings);
-            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-            embeddings = ggml_acc(ctx0, embeddings, inp,
-                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+            embd = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+            ggml_set_name(embd, "inp_embd");
+            ggml_set_input(embd);
+
+            embd = ggml_acc(ctx0, embd, model.class_embedding,
+                    embd->nb[1], embd->nb[2], embd->nb[3], 0);
+            embd = ggml_acc(ctx0, embd, inp,
+                    embd->nb[1], embd->nb[2], embd->nb[3], model.class_embedding->nb[1]);
         }
 
         struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
-        ggml_set_name(positions, "positions");
+        ggml_set_name(positions, "inp_pos");
         ggml_set_input(positions);
 
-        embeddings = ggml_add(ctx0,
-            embeddings,
+        embd = ggml_add(ctx0,
+            embd,
             ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+        return embd;
     }
 
-    // pre-layernorm
-    if (model.pre_norm_w) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "pre_ln");
+    struct ggml_tensor * build_pre_norm(struct ggml_tensor * cur) {
+        if (model.pre_norm_w) {
+            cur = ggml_norm(ctx0, cur, eps);
+            ggml_set_name(cur, "pre_ln");
 
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_norm_w), model.pre_norm_b);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.pre_norm_w), model.pre_norm_b);
+        }
+        return cur;
     }
 
-    // loop over layers
-    for (int il = 0; il < (int)hparams.n_layer + hparams.select_layer; il++) {
-        struct ggml_tensor * cur = embeddings;
+    struct ggml_tensor * build_post_norm(struct ggml_tensor * cur) {
+        if (model.post_norm_w) {
+            cur = ggml_norm(ctx0, cur, eps);
+            ggml_set_name(cur, "post_ln");
+
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.post_norm_w), model.post_norm_b);
+        }
+        return cur;
+    }
+
+    struct ggml_tensor * build_layer(struct ggml_tensor * inpL, int il) {
+        struct ggml_tensor * cur = inpL;
 
         // layernorm1
         {
@@ -654,7 +693,6 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba
 
         // self-attention
         {
-
             struct ggml_tensor * Q =
                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
 
@@ -693,9 +731,9 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba
         cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].output_w, cur), model.layers[il].output_b);
 
         // re-add the layer input, e.g., residual
-        cur = ggml_add(ctx0, cur, embeddings);
+        cur = ggml_add(ctx0, cur, inpL);
 
-        embeddings = cur; // embeddings = residual, cur = hidden_states
+        inpL = cur; // inpL = residual, cur = hidden_states
 
         // layernorm2
         {
@@ -708,7 +746,7 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba
         cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_w, cur);
         cur = ggml_add(ctx0, cur, model.layers[il].ffn_up_b);
 
-        if (hparams.use_gelu) {
+        if (use_gelu) {
             cur = ggml_gelu_inplace(ctx0, cur);
         } else {
             cur = ggml_gelu_quick_inplace(ctx0, cur);
@@ -718,74 +756,76 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba
         cur = ggml_add(ctx0, cur, model.layers[il].ffn_down_b);
 
         // residual 2
-        cur = ggml_add(ctx0, embeddings, cur);
+        cur = ggml_add(ctx0, inpL, cur);
 
-        embeddings = cur;
+        return cur;
     }
 
-    // post-layernorm
-    if (model.post_norm_w) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
+    // graph for each vision arch
 
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_norm_w), model.post_norm_b);
-    }
-
-    // llava projector
-    {
-        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
-
-        struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
-        ggml_set_name(patches, "patches");
-        ggml_set_input(patches);
-
-        // shape [1, 576, 1024]
-        // ne is whcn, ne = [1024, 576, 1, 1]
-        embeddings = ggml_get_rows(ctx0, embeddings, patches);
-
-        if (hparams.proj_type == VISION_PROJECTOR_TYPE_MLP) {
-            embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
-
-            embeddings = ggml_gelu(ctx0, embeddings);
-            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-        } else if (hparams.proj_type == VISION_PROJECTOR_TYPE_LDPV2) {
-            int n_patch = 24;
-            struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
-            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
-            mlp_0 = ggml_gelu(ctx0, mlp_0);
-            struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
-            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
-            // mlp_2 ne = [2048, 576, 1, 1]
-            // // AVG Pool Layer 2*2, strides = 2
-            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
-            // mlp_2 ne = [576, 2048, 1, 1]
-            mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
-            // mlp_2 ne [24, 24, 2048, 1]
-            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
-            // weight ne = [3, 3, 2048, 1]
-            struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
-            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
-            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
-            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
-            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
-            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
-            embeddings = peg_0;
-
-        } else {
-            GGML_ASSERT(false && "unsupported proj type");
+    struct ggml_cgraph * build_llava() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false);
+        struct ggml_tensor * cur = build_inp();
+        cur = build_pre_norm(cur);
+        for (int il = 0; il < n_layers; il++) {
+            cur = build_layer(cur, il);
         }
+        cur = build_post_norm(cur);
+
+        // llava projector
+        {
+            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]);
+
+            struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
+            ggml_set_name(patches, "inp_patches");
+            ggml_set_input(patches);
+
+            // shape [1, 576, 1024]
+            // ne is whcn, ne = [1024, 576, 1, 1]
+            cur = ggml_get_rows(ctx0, cur, patches);
+
+            if (proj_type == VISION_PROJECTOR_TYPE_MLP) {
+                cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+                cur = ggml_add(ctx0, cur, model.mm_1_b);
+
+                cur = ggml_gelu(ctx0, cur);
+                cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+                cur = ggml_add(ctx0, cur, model.mm_2_b);
+
+            } else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) {
+                int n_patch = 24;
+                struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, cur);
+                mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+                mlp_0 = ggml_gelu(ctx0, mlp_0);
+                struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+                mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+                // mlp_2 ne = [2048, 576, 1, 1]
+                // // AVG Pool Layer 2*2, strides = 2
+                mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+                // mlp_2 ne = [576, 2048, 1, 1]
+                mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+                // mlp_2 ne [24, 24, 2048, 1]
+                mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+                // weight ne = [3, 3, 2048, 1]
+                struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+                peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+                peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+                mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+                peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+                peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+                cur = ggml_cont(ctx0, peg_0);
+
+            } else {
+                GGML_ASSERT(false && "unsupported proj type");
+            }
+        }
+
+        ggml_set_name(cur, "output");
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
     }
-
-    embeddings = ggml_cont(ctx0, embeddings);
-
-    // build the graph
-    ggml_build_forward_expand(gf, embeddings);
-    ggml_free(ctx0);
-    return gf;
-}
+};
 
 static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) {
     int batch_size = inp.buf.size();
@@ -805,7 +845,16 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
     LLAMA_LOG_DEBUG("%s: num_positions = %d\n", __func__, num_positions);
 
     // build the inference graph
-    ggml_cgraph * gf = llama_vision_build_graph(ctx, batch_size, image_size);
+    llama_vision_graph_builder builder(ctx, inp);
+    ggml_cgraph * gf;
+    switch(hparams.arch) {
+        case LLM_ARCH_VISION_LLAVA:
+        case LLM_ARCH_VISION_MOBILEVLM:
+            gf = builder.build_llava();
+            break;
+        default:
+            GGML_ASSERT(false && "unsupported arch");
+    }
 
     // alloc memory for graph
     bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf);
@@ -839,16 +888,12 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
     }
 
     if (model.class_embedding) {
-        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
-
-        void* zero_mem = malloc(ggml_nbytes(embeddings));
-        memset(zero_mem, 0, ggml_nbytes(embeddings));
-        ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-        free(zero_mem);
+        struct ggml_tensor * inp_embd = ggml_graph_get_tensor(gf, "inp_embd");
+        ggml_set_zero(inp_embd);
     }
 
     {
-        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "inp_pos");
         int* positions_data = (int*)malloc(ggml_nbytes(positions));
         for (int i = 0; i < num_positions; i++) {
@@ -859,7 +904,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
     {
-        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "inp_patches");
         int* patches_data = (int*)malloc(ggml_nbytes(patches));
         for (int i = 0; i < num_patches; i++) {
             patches_data[i] = i + 1;