2023-11-06 22:36:23 +01:00
# include "ggml.h"
2024-04-21 08:19:04 -04:00
# include "log.h"
2023-11-06 22:36:23 +01:00
# include "common.h"
# include "clip.h"
# include "llava.h"
# include "llama.h"
# include "base64.hpp"
# include <cstdio>
# include <cstdlib>
# include <vector>
static bool eval_tokens ( struct llama_context * ctx_llama , std : : vector < llama_token > tokens , int n_batch , int * n_past ) {
int N = ( int ) tokens . size ( ) ;
for ( int i = 0 ; i < N ; i + = n_batch ) {
int n_eval = ( int ) tokens . size ( ) - i ;
if ( n_eval > n_batch ) {
n_eval = n_batch ;
}
if ( llama_decode ( ctx_llama , llama_batch_get_one ( & tokens [ i ] , n_eval , * n_past , 0 ) ) ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %s : failed to eval. token %d/%d (batch size %d, n_past %d) \n " , __func__ , i , N , n_batch , * n_past ) ;
2023-11-06 22:36:23 +01:00
return false ;
}
* n_past + = n_eval ;
}
return true ;
}
static bool eval_id ( struct llama_context * ctx_llama , int id , int * n_past ) {
std : : vector < llama_token > tokens ;
tokens . push_back ( id ) ;
return eval_tokens ( ctx_llama , tokens , 1 , n_past ) ;
}
static bool eval_string ( struct llama_context * ctx_llama , const char * str , int n_batch , int * n_past , bool add_bos ) {
std : : string str2 = str ;
2024-02-07 02:17:25 -06:00
std : : vector < llama_token > embd_inp = : : llama_tokenize ( ctx_llama , str2 , add_bos , true ) ;
2023-11-06 22:36:23 +01:00
eval_tokens ( ctx_llama , embd_inp , n_batch , n_past ) ;
return true ;
}
2023-12-29 06:38:38 -08:00
static const char * sample ( struct llama_sampling_context * ctx_sampling ,
struct llama_context * ctx_llama ,
int * n_past ) {
const llama_token id = llama_sampling_sample ( ctx_sampling , ctx_llama , NULL ) ;
llama_sampling_accept ( ctx_sampling , ctx_llama , id , true ) ;
2023-11-06 22:36:23 +01:00
static std : : string ret ;
2024-04-21 13:50:41 +02:00
if ( llama_token_is_eog ( llama_get_model ( ctx_llama ) , id ) ) {
2023-11-06 22:36:23 +01:00
ret = " </s> " ;
} else {
ret = llama_token_to_piece ( ctx_llama , id ) ;
}
eval_id ( ctx_llama , id , n_past ) ;
return ret . c_str ( ) ;
}
static const char * IMG_BASE64_TAG_BEGIN = " <img src= \" data:image/jpeg;base64, " ;
static const char * IMG_BASE64_TAG_END = " \" > " ;
static void find_image_tag_in_prompt ( const std : : string & prompt , size_t & begin_out , size_t & end_out ) {
begin_out = prompt . find ( IMG_BASE64_TAG_BEGIN ) ;
end_out = prompt . find ( IMG_BASE64_TAG_END , ( begin_out = = std : : string : : npos ) ? 0UL : begin_out ) ;
}
static bool prompt_contains_image ( const std : : string & prompt ) {
size_t begin , end ;
find_image_tag_in_prompt ( prompt , begin , end ) ;
return ( begin ! = std : : string : : npos ) ;
}
// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64 ( struct clip_ctx * ctx_clip , int n_threads , const std : : string & prompt ) {
size_t img_base64_str_start , img_base64_str_end ;
find_image_tag_in_prompt ( prompt , img_base64_str_start , img_base64_str_end ) ;
if ( img_base64_str_start = = std : : string : : npos | | img_base64_str_end = = std : : string : : npos ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %s: invalid base64 image tag. must be %s<base64 byte string>%s \n " , __func__ , IMG_BASE64_TAG_BEGIN , IMG_BASE64_TAG_END ) ;
2023-11-06 22:36:23 +01:00
return NULL ;
}
auto base64_bytes_start = img_base64_str_start + strlen ( IMG_BASE64_TAG_BEGIN ) ;
auto base64_bytes_count = img_base64_str_end - base64_bytes_start ;
auto base64_str = prompt . substr ( base64_bytes_start , base64_bytes_count ) ;
auto required_bytes = base64 : : required_encode_size ( base64_str . size ( ) ) ;
auto img_bytes = std : : vector < unsigned char > ( required_bytes ) ;
base64 : : decode ( base64_str . begin ( ) , base64_str . end ( ) , img_bytes . begin ( ) ) ;
auto embed = llava_image_embed_make_with_bytes ( ctx_clip , n_threads , img_bytes . data ( ) , img_bytes . size ( ) ) ;
if ( ! embed ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %s: could not load image from base64 string. \n " , __func__ ) ;
2023-11-06 22:36:23 +01:00
return NULL ;
}
return embed ;
}
static std : : string remove_image_from_prompt ( const std : : string & prompt , const char * replacement = " " ) {
size_t begin , end ;
find_image_tag_in_prompt ( prompt , begin , end ) ;
if ( begin = = std : : string : : npos | | end = = std : : string : : npos ) {
return prompt ;
}
auto pre = prompt . substr ( 0 , begin ) ;
auto post = prompt . substr ( end + strlen ( IMG_BASE64_TAG_END ) ) ;
return pre + replacement + post ;
}
struct llava_context {
struct clip_ctx * ctx_clip = NULL ;
struct llama_context * ctx_llama = NULL ;
struct llama_model * model = NULL ;
} ;
static void show_additional_info ( int /*argc*/ , char * * argv ) {
2024-04-29 07:34:24 -07:00
LOG_TEE ( " \n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \" describe the image in detail. \" ] \n " , argv [ 0 ] ) ;
2024-04-21 08:19:04 -04:00
LOG_TEE ( " note: a lower temperature value like 0.1 is recommended for better quality. \n " ) ;
2023-11-06 22:36:23 +01:00
}
2024-04-29 07:34:24 -07:00
static struct llava_image_embed * load_image ( llava_context * ctx_llava , gpt_params * params , const std : : string & fname ) {
2023-11-06 22:36:23 +01:00
// load and preprocess the image
llava_image_embed * embed = NULL ;
auto prompt = params - > prompt ;
if ( prompt_contains_image ( prompt ) ) {
if ( ! params - > image . empty ( ) ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " using base64 encoded image instead of command line image path \n " ) ;
2023-11-06 22:36:23 +01:00
}
embed = llava_image_embed_make_with_prompt_base64 ( ctx_llava - > ctx_clip , params - > n_threads , prompt ) ;
if ( ! embed ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %s: can't load image from prompt \n " , __func__ ) ;
2023-11-06 22:36:23 +01:00
return NULL ;
}
params - > prompt = remove_image_from_prompt ( prompt ) ;
} else {
2024-04-29 07:34:24 -07:00
embed = llava_image_embed_make_with_filename ( ctx_llava - > ctx_clip , params - > n_threads , fname . c_str ( ) ) ;
2023-11-06 22:36:23 +01:00
if ( ! embed ) {
2024-04-29 07:34:24 -07:00
fprintf ( stderr , " %s: is %s really an image file? \n " , __func__ , fname . c_str ( ) ) ;
2023-11-06 22:36:23 +01:00
return NULL ;
}
}
return embed ;
}
static void process_prompt ( struct llava_context * ctx_llava , struct llava_image_embed * image_embed , gpt_params * params , const std : : string & prompt ) {
int n_past = 0 ;
const int max_tgt_len = params - > n_predict < 0 ? 256 : params - > n_predict ;
2024-01-27 16:09:18 +01:00
std : : string system_prompt , user_prompt ;
size_t image_pos = prompt . find ( " <image> " ) ;
if ( image_pos ! = std : : string : : npos ) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt . substr ( 0 , image_pos ) ;
user_prompt = prompt . substr ( image_pos + std : : string ( " <image> " ) . length ( ) ) ;
2024-04-21 08:19:04 -04:00
LOG_TEE ( " system_prompt: %s \n " , system_prompt . c_str ( ) ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( params - > verbose_prompt ) {
auto tmp = : : llama_tokenize ( ctx_llava - > ctx_llama , system_prompt , true , true ) ;
for ( int i = 0 ; i < ( int ) tmp . size ( ) ; i + + ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %6d -> '%s' \n " , tmp [ i ] , llama_token_to_piece ( ctx_llava - > ctx_llama , tmp [ i ] ) . c_str ( ) ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
}
}
2024-04-21 08:19:04 -04:00
LOG_TEE ( " user_prompt: %s \n " , user_prompt . c_str ( ) ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( params - > verbose_prompt ) {
auto tmp = : : llama_tokenize ( ctx_llava - > ctx_llama , user_prompt , true , true ) ;
for ( int i = 0 ; i < ( int ) tmp . size ( ) ; i + + ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %6d -> '%s' \n " , tmp [ i ] , llama_token_to_piece ( ctx_llava - > ctx_llama , tmp [ i ] ) . c_str ( ) ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
}
}
2024-01-27 16:09:18 +01:00
} else {
// llava-1.5 native mode
system_prompt = " A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. \n USER: " ;
user_prompt = prompt + " \n ASSISTANT: " ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( params - > verbose_prompt ) {
auto tmp = : : llama_tokenize ( ctx_llava - > ctx_llama , user_prompt , true , true ) ;
for ( int i = 0 ; i < ( int ) tmp . size ( ) ; i + + ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %6d -> '%s' \n " , tmp [ i ] , llama_token_to_piece ( ctx_llava - > ctx_llama , tmp [ i ] ) . c_str ( ) ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
}
}
2024-01-27 16:09:18 +01:00
}
2024-04-09 13:44:08 -04:00
eval_string ( ctx_llava - > ctx_llama , system_prompt . c_str ( ) , params - > n_batch , & n_past , true ) ;
2023-11-06 22:36:23 +01:00
llava_eval_image_embed ( ctx_llava - > ctx_llama , image_embed , params - > n_batch , & n_past ) ;
2024-01-27 16:09:18 +01:00
eval_string ( ctx_llava - > ctx_llama , user_prompt . c_str ( ) , params - > n_batch , & n_past , false ) ;
2023-11-06 22:36:23 +01:00
// generate the response
2024-04-21 08:19:04 -04:00
LOG_TEE ( " \n " ) ;
2023-12-29 06:38:38 -08:00
struct llama_sampling_context * ctx_sampling = llama_sampling_init ( params - > sparams ) ;
2024-05-10 07:01:08 -04:00
if ( ! ctx_sampling ) {
fprintf ( stderr , " %s: failed to initialize sampling subsystem \n " , __func__ ) ;
exit ( 1 ) ;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
std : : string response = " " ;
2023-11-06 22:36:23 +01:00
for ( int i = 0 ; i < max_tgt_len ; i + + ) {
2023-12-29 06:38:38 -08:00
const char * tmp = sample ( ctx_sampling , ctx_llava - > ctx_llama , & n_past ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
response + = tmp ;
2023-11-06 22:36:23 +01:00
if ( strcmp ( tmp , " </s> " ) = = 0 ) break ;
2024-01-27 16:09:18 +01:00
if ( strstr ( tmp , " ### " ) ) break ; // Yi-VL behavior
2023-11-06 22:36:23 +01:00
printf ( " %s " , tmp ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( strstr ( response . c_str ( ) , " <|im_end|> " ) ) break ; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if ( strstr ( response . c_str ( ) , " <|im_start|> " ) ) break ; // Yi-34B llava-1.6
if ( strstr ( response . c_str ( ) , " USER: " ) ) break ; // mistral llava-1.6
2023-11-06 22:36:23 +01:00
fflush ( stdout ) ;
}
2023-12-29 06:38:38 -08:00
llama_sampling_free ( ctx_sampling ) ;
2023-11-06 22:36:23 +01:00
printf ( " \n " ) ;
}
2024-04-29 07:34:24 -07:00
static struct llama_model * llava_init ( gpt_params * params ) {
2024-02-16 01:31:07 -08:00
llama_backend_init ( ) ;
llama_numa_init ( params - > numa ) ;
2023-11-06 22:36:23 +01:00
2023-11-06 23:43:59 -08:00
llama_model_params model_params = llama_model_params_from_gpt_params ( * params ) ;
2023-11-06 22:36:23 +01:00
llama_model * model = llama_load_model_from_file ( params - > model . c_str ( ) , model_params ) ;
if ( model = = NULL ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %s: error: unable to load model \n " , __func__ ) ;
2023-11-06 22:36:23 +01:00
return NULL ;
}
2024-04-29 07:34:24 -07:00
return model ;
}
static struct llava_context * llava_init_context ( gpt_params * params , llama_model * model ) {
const char * clip_path = params - > mmproj . c_str ( ) ;
auto prompt = params - > prompt ;
if ( prompt . empty ( ) ) {
prompt = " describe the image in detail. " ;
}
auto ctx_clip = clip_model_load ( clip_path , /*verbosity=*/ 1 ) ;
2023-11-06 22:36:23 +01:00
2023-11-06 23:43:59 -08:00
llama_context_params ctx_params = llama_context_params_from_gpt_params ( * params ) ;
2023-11-06 22:36:23 +01:00
ctx_params . n_ctx = params - > n_ctx < 2048 ? 2048 : params - > n_ctx ; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model ( model , ctx_params ) ;
if ( ctx_llama = = NULL ) {
2024-04-21 08:19:04 -04:00
LOG_TEE ( " %s: error: failed to create the llama_context \n " , __func__ ) ;
2023-11-06 22:36:23 +01:00
return NULL ;
}
auto ctx_llava = ( struct llava_context * ) malloc ( sizeof ( llava_context ) ) ;
ctx_llava - > ctx_llama = ctx_llama ;
ctx_llava - > ctx_clip = ctx_clip ;
ctx_llava - > model = model ;
return ctx_llava ;
}
static void llava_free ( struct llava_context * ctx_llava ) {
if ( ctx_llava - > ctx_clip ) {
clip_free ( ctx_llava - > ctx_clip ) ;
ctx_llava - > ctx_clip = NULL ;
}
llama_free ( ctx_llava - > ctx_llama ) ;
llama_free_model ( ctx_llava - > model ) ;
llama_backend_free ( ) ;
}
2024-04-21 08:19:04 -04:00
static void llama_log_callback_logTee ( ggml_log_level level , const char * text , void * user_data ) {
( void ) level ;
( void ) user_data ;
LOG_TEE ( " %s " , text ) ;
}
2023-11-06 22:36:23 +01:00
int main ( int argc , char * * argv ) {
ggml_time_init ( ) ;
gpt_params params ;
if ( ! gpt_params_parse ( argc , argv , params ) ) {
show_additional_info ( argc , argv ) ;
return 1 ;
}
2024-04-21 08:19:04 -04:00
# ifndef LOG_DISABLE_LOGS
log_set_target ( log_filename_generator ( " llava " , " log " ) ) ;
LOG_TEE ( " Log start \n " ) ;
log_dump_cmdline ( argc , argv ) ;
llama_log_set ( llama_log_callback_logTee , nullptr ) ;
# endif // LOG_DISABLE_LOGS
2023-11-06 22:36:23 +01:00
if ( params . mmproj . empty ( ) | | ( params . image . empty ( ) & & ! prompt_contains_image ( params . prompt ) ) ) {
gpt_print_usage ( argc , argv , params ) ;
show_additional_info ( argc , argv ) ;
return 1 ;
}
2024-04-29 07:34:24 -07:00
auto model = llava_init ( & params ) ;
if ( model = = NULL ) {
fprintf ( stderr , " %s: error: failed to init llava model \n " , __func__ ) ;
2023-11-06 22:36:23 +01:00
return 1 ;
}
2024-04-29 07:34:24 -07:00
for ( auto & image : params . image ) {
auto ctx_llava = llava_init_context ( & params , model ) ;
2023-11-06 22:36:23 +01:00
2024-04-29 07:34:24 -07:00
auto image_embed = load_image ( ctx_llava , & params , image ) ;
if ( ! image_embed ) {
std : : cerr < < " error: failed to load image " < < image < < " . Terminating \n \n " ;
return 1 ;
}
// process the prompt
process_prompt ( ctx_llava , image_embed , & params , params . prompt ) ;
2023-11-06 22:36:23 +01:00
2024-04-29 07:34:24 -07:00
llama_print_timings ( ctx_llava - > ctx_llama ) ;
llava_image_embed_free ( image_embed ) ;
ctx_llava - > model = NULL ;
llava_free ( ctx_llava ) ;
}
llama_free_model ( model ) ;
2023-11-06 22:36:23 +01:00
return 0 ;
}