2023-10-12 17:23:18 +02:00
// NOTE: This is modified from clip.cpp only for LLaVA,
// so there might still be unnecessary artifacts hanging around.
// I'll gradually clean and extend it.
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
2023-10-12 17:23:18 +02:00
# include "clip.h"
# include "ggml.h"
# include "ggml-alloc.h"
2023-12-29 17:52:15 +01:00
# include "ggml-backend.h"
# ifdef GGML_USE_CUBLAS
# include "ggml-cuda.h"
# endif
# ifdef GGML_USE_METAL
# include "ggml-metal.h"
# endif
2023-10-12 17:23:18 +02:00
# define STB_IMAGE_IMPLEMENTATION
# include "stb_image.h"
2024-01-23 13:12:57 +01:00
# include <cassert>
# include <cmath>
# include <cstdlib>
# include <cstring>
# include <fstream>
# include <iostream>
# include <map>
# include <regex>
# include <stdexcept>
# include <vector>
# include <sstream>
# include <cinttypes>
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
# include <limits>
//#define CLIP_DEBUG_FUNCTIONS
// RGB image stored as unsigned 8-bit samples
struct clip_image_u8 {
    int nx; // image width  (written first in the PPM header by the debug writer)
    int ny; // image height (written second in the PPM header by the debug writer)

    std::vector<uint8_t> buf; // pixel bytes
};
// RGB image stored as 32-bit floats (NHWC)
// Samples are interleaved in memory: RGBRGBRGB...
struct clip_image_f32 {
    int nx; // image dimension (x)
    int ny; // image dimension (y)

    std::vector<float> buf; // float samples
};
2024-01-23 13:12:57 +01:00
2023-10-12 17:23:18 +02:00
static std : : string format ( const char * fmt , . . . ) {
va_list ap ;
va_list ap2 ;
va_start ( ap , fmt ) ;
va_copy ( ap2 , ap ) ;
int size = vsnprintf ( NULL , 0 , fmt , ap ) ;
GGML_ASSERT ( size > = 0 & & size < INT_MAX ) ; // NOLINT
std : : vector < char > buf ( size + 1 ) ;
int size2 = vsnprintf ( buf . data ( ) , size + 1 , fmt , ap2 ) ;
GGML_ASSERT ( size2 = = size ) ;
va_end ( ap2 ) ;
va_end ( ap ) ;
return std : : string ( buf . data ( ) , buf . size ( ) ) ;
}
//
// key constants
//
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// GGUF metadata key strings: general file information and the
// "clip.has_*_encoder" keys (presumably presence flags - TODO confirm)
# define KEY_FTYPE "general.file_type"
# define KEY_NAME "general.name"
# define KEY_DESCRIPTION "general.description"
# define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
# define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
2023-10-12 17:23:18 +02:00
// GGUF metadata key string for the LLaVA projector (presumably a presence flag - TODO confirm)
# define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// GGUF metadata key strings for model hyperparameters.
// Keys containing "%s" are printf-style templates; the placeholder is
// presumably filled in by the caller (e.g. via format()) - TODO confirm
# define KEY_USE_GELU "clip.use_gelu"
# define KEY_N_EMBD "clip.%s.embedding_length"
# define KEY_N_FF "clip.%s.feed_forward_length"
# define KEY_N_BLOCK "clip.%s.block_count"
# define KEY_N_HEAD "clip.%s.attention.head_count"
2023-10-12 17:23:18 +02:00
// GGUF metadata key template ("%s" is a printf-style placeholder) for the layer-norm epsilon
# define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// GGUF metadata key strings: projection size, tokenizer, text context
// length, and vision-side image/projector settings.
// KEY_PROJ_DIM contains a printf-style "%s" placeholder.
# define KEY_PROJ_DIM "clip.%s.projection_dim"
# define KEY_TOKENS "tokenizer.ggml.tokens"
# define KEY_N_POSITIONS "clip.text.context_length"
# define KEY_IMAGE_SIZE "clip.vision.image_size"
# define KEY_PATCH_SIZE "clip.vision.patch_size"
# define KEY_IMAGE_MEAN "clip.vision.image_mean"
# define KEY_IMAGE_STD "clip.vision.image_std"
# define KEY_PROJ_TYPE "clip.projector_type"
# define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
# define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
# define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
2023-10-12 17:23:18 +02:00
//
// tensor name constants
//
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// Tensor name strings. Entries containing "%s" / "%d" are printf-style
// templates; presumably "%d" is a block/layer index and the "%s" fields are
// a model prefix and a suffix such as weight/bias - TODO confirm against callers
# define TN_TOKEN_EMBD "%s.token_embd.weight"
# define TN_POS_EMBD "%s.position_embd.weight"
# define TN_CLASS_EMBD "v.class_embd"
# define TN_PATCH_EMBD "v.patch_embd.weight"
# define TN_ATTN_K "%s.blk.%d.attn_k.%s"
# define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
# define TN_ATTN_V "%s.blk.%d.attn_v.%s"
# define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
# define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
# define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
# define TN_LN_1 "%s.blk.%d.ln1.%s"
# define TN_LN_2 "%s.blk.%d.ln2.%s"
# define TN_LN_PRE "%s.pre_ln.%s"
# define TN_LN_POST "%s.post_ln.%s"
# define TN_TEXT_PROJ "text_projection.weight"
# define TN_VIS_PROJ "visual_projection.weight"
# define TN_LLAVA_PROJ "mm.%d.%s"
# define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
2024-01-22 14:09:35 +01:00
// Tensor name template with two "%d" index fields and a trailing "%s" field
# define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// Tensor name string for the image-newline tensor (no placeholders)
# define TN_IMAGE_NEWLINE "model.image_newline"
2024-01-22 14:09:35 +01:00
// Projector variants known to this file.
// PROJECTOR_TYPE_UNKNOWN is what clip_projector_type_from_string() returns
// when a name matches no entry of PROJECTOR_TYPE_NAMES.
enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_UNKNOWN,
};
static std : : map < projector_type , std : : string > PROJECTOR_TYPE_NAMES = {
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
{ PROJECTOR_TYPE_MLP , " mlp " } ,
{ PROJECTOR_TYPE_LDP , " ldp " } ,
2024-01-22 14:09:35 +01:00
} ;
2023-10-12 17:23:18 +02:00
//
// utilities to get data from a gguf file
//
static int get_key_idx ( const gguf_context * ctx , const char * key ) {
int i = gguf_find_key ( ctx , key ) ;
if ( i = = - 1 ) {
fprintf ( stderr , " key %s not found in file \n " , key ) ;
throw std : : runtime_error ( format ( " Missing required key: %s " , key ) ) ;
}
return i ;
}
// Read the value stored under `key` via gguf_get_val_u32().
// The key is resolved with get_key_idx(), so a missing key propagates
// that function's std::runtime_error.
static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
    return gguf_get_val_u32(ctx, get_key_idx(ctx, key.c_str()));
}
// Read the value stored under `key` via gguf_get_val_f32().
// The key is resolved with get_key_idx(), so a missing key propagates
// that function's std::runtime_error.
static float get_f32(const gguf_context * ctx, const std::string & key) {
    return gguf_get_val_f32(ctx, get_key_idx(ctx, key.c_str()));
}
static struct ggml_tensor * get_tensor ( struct ggml_context * ctx , const std : : string & name ) {
struct ggml_tensor * cur = ggml_get_tensor ( ctx , name . c_str ( ) ) ;
if ( ! cur ) {
2023-10-19 15:59:11 +02:00
throw std : : runtime_error ( format ( " %s: unable to find tensor %s \n " , __func__ , name . c_str ( ) ) ) ;
2023-10-12 17:23:18 +02:00
}
return cur ;
}
static std : : string get_ftype ( int ftype ) {
2024-01-10 14:37:09 +01:00
return ggml_type_name ( static_cast < ggml_type > ( ftype ) ) ;
2023-10-12 17:23:18 +02:00
}
2024-01-22 14:09:35 +01:00
static std : : string gguf_data_to_str ( enum gguf_type type , const void * data , int i ) {
switch ( type ) {
case GGUF_TYPE_UINT8 : return std : : to_string ( ( ( const uint8_t * ) data ) [ i ] ) ;
case GGUF_TYPE_INT8 : return std : : to_string ( ( ( const int8_t * ) data ) [ i ] ) ;
case GGUF_TYPE_UINT16 : return std : : to_string ( ( ( const uint16_t * ) data ) [ i ] ) ;
case GGUF_TYPE_INT16 : return std : : to_string ( ( ( const int16_t * ) data ) [ i ] ) ;
case GGUF_TYPE_UINT32 : return std : : to_string ( ( ( const uint32_t * ) data ) [ i ] ) ;
case GGUF_TYPE_INT32 : return std : : to_string ( ( ( const int32_t * ) data ) [ i ] ) ;
case GGUF_TYPE_UINT64 : return std : : to_string ( ( ( const uint64_t * ) data ) [ i ] ) ;
case GGUF_TYPE_INT64 : return std : : to_string ( ( ( const int64_t * ) data ) [ i ] ) ;
case GGUF_TYPE_FLOAT32 : return std : : to_string ( ( ( const float * ) data ) [ i ] ) ;
case GGUF_TYPE_FLOAT64 : return std : : to_string ( ( ( const double * ) data ) [ i ] ) ;
case GGUF_TYPE_BOOL : return ( ( const bool * ) data ) [ i ] ? " true " : " false " ;
default : return format ( " unknown type %d " , type ) ;
}
}
// Replace every non-overlapping occurrence of `search` in `s` with `replace`.
//
// s       : string modified in place
// search  : substring to look for; an empty `search` leaves `s` unchanged
// replace : text substituted for each occurrence
// Scanning resumes after each match in the ORIGINAL text, so text inserted
// by `replace` is never re-scanned.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    // an empty pattern matches at every position without ever advancing,
    // which would loop forever; treat it as "nothing to replace"
    if (search.empty()) {
        return;
    }

    std::string result;
    size_t pos = 0;
    for (;;) {
        const size_t hit = s.find(search, pos);
        if (hit == std::string::npos) {
            // no further match: copy the remaining tail and stop
            result.append(s, pos, std::string::npos);
            break;
        }
        // copy the text before the match, then the replacement
        result.append(s, pos, hit - pos);
        result += replace;
        pos = hit + search.length();
    }

    s = std::move(result);
}
static std : : string gguf_kv_to_str ( const struct gguf_context * ctx_gguf , int i ) {
const enum gguf_type type = gguf_get_kv_type ( ctx_gguf , i ) ;
switch ( type ) {
case GGUF_TYPE_STRING :
return gguf_get_val_str ( ctx_gguf , i ) ;
case GGUF_TYPE_ARRAY :
{
const enum gguf_type arr_type = gguf_get_arr_type ( ctx_gguf , i ) ;
int arr_n = gguf_get_arr_n ( ctx_gguf , i ) ;
const void * data = gguf_get_arr_data ( ctx_gguf , i ) ;
std : : stringstream ss ;
ss < < " [ " ;
for ( int j = 0 ; j < arr_n ; j + + ) {
if ( arr_type = = GGUF_TYPE_STRING ) {
std : : string val = gguf_get_arr_str ( ctx_gguf , i , j ) ;
// escape quotes
replace_all ( val , " \\ " , " \\ \\ " ) ;
replace_all ( val , " \" " , " \\ \" " ) ;
ss < < ' " ' < < val < < ' " ' ;
} else if ( arr_type = = GGUF_TYPE_ARRAY ) {
ss < < " ??? " ;
} else {
ss < < gguf_data_to_str ( arr_type , data , j ) ;
}
if ( j < arr_n - 1 ) {
ss < < " , " ;
}
}
ss < < " ] " ;
return ss . str ( ) ;
}
default :
return gguf_data_to_str ( type , gguf_get_val_data ( ctx_gguf , i ) , 0 ) ;
}
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
static void print_tensor_info ( const ggml_tensor * tensor , const char * prefix = " " ) {
2024-01-22 14:09:35 +01:00
size_t tensor_size = ggml_nbytes ( tensor ) ;
2024-01-23 13:12:57 +01:00
printf ( " %s: n_dims = %d, name = %s, tensor_size=%zu, shape:[% " PRId64 " , % " PRId64 " , % " PRId64 " , % " PRId64 " ], type = %s \n " ,
2024-01-22 14:09:35 +01:00
prefix , ggml_n_dims ( tensor ) , tensor - > name , tensor_size ,
2024-01-23 13:12:57 +01:00
tensor - > ne [ 0 ] , tensor - > ne [ 1 ] , tensor - > ne [ 2 ] , tensor - > ne [ 3 ] , ggml_type_name ( tensor - > type ) ) ;
2024-01-22 14:09:35 +01:00
}
// Reverse lookup in PROJECTOR_TYPE_NAMES: return the projector type whose
// name equals `name`, or PROJECTOR_TYPE_UNKNOWN when no entry matches.
static projector_type clip_projector_type_from_string(const std::string & name) {
    projector_type found = PROJECTOR_TYPE_UNKNOWN;

    for (auto it = PROJECTOR_TYPE_NAMES.begin(); it != PROJECTOR_TYPE_NAMES.end(); ++it) {
        if (it->second == name) {
            found = it->first;
            break;
        }
    }

    return found;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
# ifdef CLIP_DEBUG_FUNCTIONS
static void clip_image_write_image_to_ppm ( const clip_image_u8 & img , const std : : string & filename ) {
std : : ofstream file ( filename , std : : ios : : binary ) ;
if ( ! file . is_open ( ) ) {
std : : cerr < < " Failed to open file for writing: " < < filename < < std : : endl ;
return ;
}
2023-12-30 22:24:42 +01:00
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// PPM header: P6 format, width, height, and max color value
file < < " P6 \n " < < img . nx < < " " < < img . ny < < " \n 255 \n " ;
2023-12-30 22:24:42 +01:00
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// Write pixel data
for ( size_t i = 0 ; i < img . buf . size ( ) ; i + = 3 ) {
// PPM expects binary data in RGB format, which matches our image buffer
file . write ( reinterpret_cast < const char * > ( & img . buf [ i ] ) , 3 ) ;
}
2023-12-30 22:24:42 +01:00
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
file . close ( ) ;
}
// Debug helper: writes `img` to `filename` as an uncompressed 24-bit BMP.
//
// img.buf is read as packed 3-byte pixels at index (y * nx + x) * 3, in R, G, B
// order. The file consists of a 14-byte file header, a 40-byte BITMAPINFOHEADER
// and the pixel rows, written from y = ny - 1 down to 0, each pixel as B, G, R
// and each row zero-padded to a multiple of 4 bytes.
//
// Nothing is written (and a message goes to std::cerr) when the dimensions are
// negative, when img.buf holds fewer than 3 * nx * ny bytes, or when the file
// cannot be opened.
static void clip_image_save_to_bmp(const clip_image_u8 & img, const std::string & filename) {
    // Validate up front so the pixel loop below can never index past img.buf.
    if (img.nx < 0 || img.ny < 0 ||
        img.buf.size() < 3 * static_cast<size_t>(img.nx) * static_cast<size_t>(img.ny)) {
        std::cerr << "Invalid image size or pixel buffer, not writing: " << filename << std::endl;
        return;
    }

    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Failed to open file for writing: " << filename << std::endl;
        return;
    }

    const int bytesPerPixel = 3;
    const int widthInBytes  = img.nx * bytesPerPixel;
    const int paddingAmount = (4 - (widthInBytes % 4)) % 4; // rows are aligned to 4 bytes
    const int stride        = widthInBytes + paddingAmount;
    const int fileSize      = 54 + (stride * img.ny);       // file header + info header + padded rows

    // Bitmap file header
    unsigned char fileHeader[14] = {
        'B', 'M',    // Signature
        0, 0, 0, 0,  // Image file size in bytes
        0, 0, 0, 0,  // Reserved
        54, 0, 0, 0  // Start of pixel array
    };

    // Total file size, little-endian
    fileHeader[2] = (unsigned char)(fileSize);
    fileHeader[3] = (unsigned char)(fileSize >> 8);
    fileHeader[4] = (unsigned char)(fileSize >> 16);
    fileHeader[5] = (unsigned char)(fileSize >> 24);

    // Bitmap information header (BITMAPINFOHEADER)
    unsigned char infoHeader[40] = {
        40, 0, 0, 0, // Size of this header (40 bytes)
        0, 0, 0, 0,  // Image width
        0, 0, 0, 0,  // Image height
        1, 0,        // Number of color planes
        24, 0,       // Bits per pixel
        0, 0, 0, 0,  // No compression
        0, 0, 0, 0,  // Image size (can be 0 for no compression)
        0, 0, 0, 0,  // X pixels per meter (not specified)
        0, 0, 0, 0,  // Y pixels per meter (not specified)
        0, 0, 0, 0,  // Total colors (color table not used)
        0, 0, 0, 0   // Important colors (all are important)
    };

    // Width and height in the information header, little-endian
    infoHeader[4]  = (unsigned char)(img.nx);
    infoHeader[5]  = (unsigned char)(img.nx >> 8);
    infoHeader[6]  = (unsigned char)(img.nx >> 16);
    infoHeader[7]  = (unsigned char)(img.nx >> 24);
    infoHeader[8]  = (unsigned char)(img.ny);
    infoHeader[9]  = (unsigned char)(img.ny >> 8);
    infoHeader[10] = (unsigned char)(img.ny >> 16);
    infoHeader[11] = (unsigned char)(img.ny >> 24);

    // Write file headers
    file.write(reinterpret_cast<char *>(fileHeader), sizeof(fileHeader));
    file.write(reinterpret_cast<char *>(infoHeader), sizeof(infoHeader));

    // Pixel data: one padded row is assembled in memory and written at once.
    // The trailing padding bytes of `row` are never touched and stay zero.
    std::vector<unsigned char> row(static_cast<size_t>(stride), 0);
    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
        // Index arithmetic is done in size_t so large images cannot overflow int.
        const size_t rowStart = static_cast<size_t>(y) * static_cast<size_t>(img.nx) * 3;
        for (int x = 0; x < img.nx; ++x) {
            const size_t srcIndex = rowStart + static_cast<size_t>(x) * 3;
            const size_t dstIndex = static_cast<size_t>(x) * 3;
            // BMP stores pixels in BGR format
            row[dstIndex]     = img.buf[srcIndex + 2];
            row[dstIndex + 1] = img.buf[srcIndex + 1];
            row[dstIndex + 2] = img.buf[srcIndex];
        }
        file.write(reinterpret_cast<char *>(row.data()), stride);
    }

    file.close();
}
// Debug helper: converts a float image to 8-bit samples.
//
// dst takes src's dimensions and dst.buf is resized to 3 * nx * ny entries.
// Each sample is multiplied by 255, truncated to int and clamped to [0, 255].
// Only indices that exist in both buffers are converted, so a src.buf whose
// length disagrees with 3 * nx * ny cannot cause a write past the end of dst.buf.
static void clip_image_convert_f32_to_u8(const clip_image_f32 & src, clip_image_u8 & dst) {
    dst.nx = src.nx;
    dst.ny = src.ny;
    dst.buf.resize(3 * src.nx * src.ny);

    const size_t count = std::min<size_t>(src.buf.size(), dst.buf.size());
    for (size_t i = 0; i < count; ++i) {
        const int scaled = int(src.buf[i] * 255.0f);
        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(scaled, 0), 255));
    }
}
# endif
2023-12-30 22:24:42 +01:00
2023-10-12 17:23:18 +02:00
//
// clip layers
//
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// Hyperparameters of the CLIP vision model.
// Every member has a default so that a default-constructed instance never
// carries indeterminate values.
struct clip_hparams {
    int32_t image_size     = 0;    // side length of the square input image
    int32_t patch_size     = 0;    // kernel size and stride of the patch-embedding convolution
    int32_t hidden_size    = 0;    // width of the token embeddings
    int32_t n_intermediate = 0;
    int32_t projection_dim = 0;
    int32_t n_head         = 0;    // attention heads; head width is hidden_size / n_head
    int32_t n_layer        = 0;    // number of transformer layers
    float   eps            = 0.0f; // epsilon passed to every layer normalisation

    char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)

    // NOTE(review): presumably a flattened list of grid resolutions terminated
    // by 0 - confirm against the loader before relying on the zero fill.
    int32_t image_grid_pinpoints[32] = {0};
    int32_t image_crop_resolution    = 0;
};
2023-10-12 17:23:18 +02:00
// Tensors of one transformer layer of the vision encoder.
// `_w` members are weights, `_b` members the matching biases.
struct clip_layer {
    // attention projections: key, query, value and output
    struct ggml_tensor * k_w;
    struct ggml_tensor * k_b;
    struct ggml_tensor * q_w;
    struct ggml_tensor * q_b;
    struct ggml_tensor * v_w;
    struct ggml_tensor * v_b;

    struct ggml_tensor * o_w;
    struct ggml_tensor * o_b;

    // layernorm 1 (scale and shift applied before self-attention)
    struct ggml_tensor * ln_1_w;
    struct ggml_tensor * ln_1_b;

    // feed-forward: input projection, then output projection
    struct ggml_tensor * ff_i_w;
    struct ggml_tensor * ff_i_b;

    struct ggml_tensor * ff_o_w;
    struct ggml_tensor * ff_o_b;

    // layernorm 2 (scale and shift applied before the feed-forward part)
    struct ggml_tensor * ln_2_w;
    struct ggml_tensor * ln_2_b;
};
struct clip_vision_model {
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
struct clip_hparams hparams ;
2023-10-12 17:23:18 +02:00
// embeddings
struct ggml_tensor * class_embedding ;
struct ggml_tensor * patch_embeddings ;
struct ggml_tensor * position_embeddings ;
struct ggml_tensor * pre_ln_w ;
struct ggml_tensor * pre_ln_b ;
std : : vector < clip_layer > layers ;
struct ggml_tensor * post_ln_w ;
struct ggml_tensor * post_ln_b ;
struct ggml_tensor * projection ;
// LLaVA projection
2024-01-27 16:09:18 +01:00
struct ggml_tensor * mm_0_w = NULL ;
struct ggml_tensor * mm_0_b = NULL ;
struct ggml_tensor * mm_2_w = NULL ;
struct ggml_tensor * mm_2_b = NULL ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
struct ggml_tensor * image_newline = NULL ;
2024-01-27 16:09:18 +01:00
// Yi type models with mlp+normalization projection
struct ggml_tensor * mm_1_w = NULL ; // Yi type models have 0, 1, 3, 4
struct ggml_tensor * mm_1_b = NULL ;
struct ggml_tensor * mm_3_w = NULL ;
struct ggml_tensor * mm_3_b = NULL ;
struct ggml_tensor * mm_4_w = NULL ;
struct ggml_tensor * mm_4_b = NULL ;
2024-01-22 14:09:35 +01:00
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w ;
struct ggml_tensor * mm_model_mlp_1_b ;
struct ggml_tensor * mm_model_mlp_3_w ;
struct ggml_tensor * mm_model_mlp_3_b ;
struct ggml_tensor * mm_model_block_1_block_0_0_w ;
struct ggml_tensor * mm_model_block_1_block_0_1_w ;
struct ggml_tensor * mm_model_block_1_block_0_1_b ;
struct ggml_tensor * mm_model_block_1_block_1_fc1_w ;
struct ggml_tensor * mm_model_block_1_block_1_fc1_b ;
struct ggml_tensor * mm_model_block_1_block_1_fc2_w ;
struct ggml_tensor * mm_model_block_1_block_1_fc2_b ;
struct ggml_tensor * mm_model_block_1_block_2_0_w ;
struct ggml_tensor * mm_model_block_1_block_2_1_w ;
struct ggml_tensor * mm_model_block_1_block_2_1_b ;
struct ggml_tensor * mm_model_block_2_block_0_0_w ;
struct ggml_tensor * mm_model_block_2_block_0_1_w ;
struct ggml_tensor * mm_model_block_2_block_0_1_b ;
struct ggml_tensor * mm_model_block_2_block_1_fc1_w ;
struct ggml_tensor * mm_model_block_2_block_1_fc1_b ;
struct ggml_tensor * mm_model_block_2_block_1_fc2_w ;
struct ggml_tensor * mm_model_block_2_block_1_fc2_b ;
struct ggml_tensor * mm_model_block_2_block_2_0_w ;
struct ggml_tensor * mm_model_block_2_block_2_1_w ;
struct ggml_tensor * mm_model_block_2_block_2_1_b ;
2023-10-12 17:23:18 +02:00
} ;
struct clip_ctx {
2023-12-30 22:24:42 +01:00
bool has_text_encoder = false ;
bool has_vision_encoder = false ;
2023-10-12 17:23:18 +02:00
bool has_llava_projector = false ;
2023-12-30 22:24:42 +01:00
2023-10-12 17:23:18 +02:00
struct clip_vision_model vision_model ;
2024-01-22 14:09:35 +01:00
projector_type proj_type = PROJECTOR_TYPE_MLP ;
2023-12-30 22:24:42 +01:00
2023-10-12 17:23:18 +02:00
float image_mean [ 3 ] ;
float image_std [ 3 ] ;
bool use_gelu = false ;
int32_t ftype = 1 ;
2023-12-30 22:24:42 +01:00
2023-10-12 17:23:18 +02:00
struct gguf_context * ctx_gguf ;
2023-12-30 22:24:42 +01:00
struct ggml_context * ctx_data ;
std : : vector < uint8_t > buf_compute_meta ;
2023-10-12 17:23:18 +02:00
// memory buffers to evaluate the model
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
ggml_backend_buffer_t params_buffer = NULL ;
2023-12-29 17:52:15 +01:00
ggml_backend_buffer_t compute_buffer = NULL ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
ggml_backend_t backend = NULL ;
2024-02-12 08:16:06 +01:00
ggml_gallocr_t compute_alloc = NULL ;
2023-10-12 17:23:18 +02:00
} ;
2023-12-30 22:24:42 +01:00
static ggml_cgraph * clip_image_build_graph ( clip_ctx * ctx , const clip_image_f32_batch * imgs ) {
2023-10-12 17:23:18 +02:00
if ( ! ctx - > has_vision_encoder ) {
printf ( " This gguf file seems to have no vision encoder \n " ) ;
return nullptr ;
}
const auto & model = ctx - > vision_model ;
const auto & hparams = model . hparams ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
const int image_size = hparams . image_size ;
const int patch_size = hparams . patch_size ;
const int num_patches = ( ( image_size / patch_size ) * ( image_size / patch_size ) ) ;
const int num_patches_per_side = image_size / patch_size ; GGML_UNUSED ( num_patches_per_side ) ;
const int num_positions = num_patches + 1 ;
const int hidden_size = hparams . hidden_size ;
const int n_head = hparams . n_head ;
const int d_head = hidden_size / n_head ;
const int n_layer = hparams . n_layer ;
const float eps = hparams . eps ;
const int batch_size = imgs - > size ;
2023-12-30 22:24:42 +01:00
if ( ctx - > has_llava_projector ) {
2023-10-12 17:23:18 +02:00
GGML_ASSERT ( batch_size = = 1 ) ;
}
2023-12-30 22:24:42 +01:00
2023-10-12 17:23:18 +02:00
struct ggml_init_params params = {
2023-12-30 22:24:42 +01:00
/*.mem_size =*/ ctx - > buf_compute_meta . size ( ) ,
/*.mem_buffer =*/ ctx - > buf_compute_meta . data ( ) ,
/*.no_alloc =*/ true ,
2023-10-12 17:23:18 +02:00
} ;
struct ggml_context * ctx0 = ggml_init ( params ) ;
struct ggml_cgraph * gf = ggml_new_graph ( ctx0 ) ;
struct ggml_tensor * inp_raw = ggml_new_tensor_4d ( ctx0 , GGML_TYPE_F32 , image_size , image_size , 3 , batch_size ) ;
2024-02-12 08:16:06 +01:00
ggml_set_name ( inp_raw , " inp_raw " ) ;
ggml_set_input ( inp_raw ) ;
2023-10-12 17:23:18 +02:00
struct ggml_tensor * inp = ggml_conv_2d ( ctx0 , model . patch_embeddings , inp_raw , patch_size , patch_size , 0 , 0 , 1 , 1 ) ;
inp = ggml_reshape_3d ( ctx0 , inp , num_patches , hidden_size , batch_size ) ;
inp = ggml_cont ( ctx0 , ggml_permute ( ctx0 , inp , 1 , 0 , 2 , 3 ) ) ;
// concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , hidden_size , num_positions , batch_size ) ;
2024-02-12 08:16:06 +01:00
ggml_set_name ( embeddings , " embeddings " ) ;
ggml_set_input ( embeddings ) ;
2023-10-12 17:23:18 +02:00
2023-12-29 17:52:15 +01:00
embeddings = ggml_acc ( ctx0 , embeddings , model . class_embedding ,
embeddings - > nb [ 1 ] , embeddings - > nb [ 2 ] , embeddings - > nb [ 3 ] , 0 ) ;
2023-10-12 17:23:18 +02:00
2023-12-29 17:52:15 +01:00
embeddings = ggml_acc ( ctx0 , embeddings , inp ,
embeddings - > nb [ 1 ] , embeddings - > nb [ 2 ] , embeddings - > nb [ 3 ] , model . class_embedding - > nb [ 1 ] ) ;
2023-10-12 17:23:18 +02:00
struct ggml_tensor * positions = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , num_positions ) ;
2024-02-12 08:16:06 +01:00
ggml_set_name ( positions , " positions " ) ;
ggml_set_input ( positions ) ;
2023-10-12 17:23:18 +02:00
embeddings =
2023-12-29 17:52:15 +01:00
ggml_add ( ctx0 , embeddings , ggml_get_rows ( ctx0 , model . position_embeddings , positions ) ) ;
2023-10-12 17:23:18 +02:00
// pre-layernorm
{
embeddings = ggml_norm ( ctx0 , embeddings , eps ) ;
2024-01-27 16:09:18 +01:00
ggml_set_name ( embeddings , " pre_ln " ) ;
2023-10-12 17:23:18 +02:00
2023-12-29 17:52:15 +01:00
embeddings = ggml_add ( ctx0 , ggml_mul ( ctx0 , embeddings , model . pre_ln_w ) , model . pre_ln_b ) ;
2023-10-12 17:23:18 +02:00
}
// loop over layers
for ( int il = 0 ; il < n_layer - 1 ; il + + ) {
struct ggml_tensor * cur = embeddings ; // embeddings = residual, cur = hidden_states
//const size_t nb_q_w = model.layers[il].q_w->nb[0];
// layernorm1
{
cur = ggml_norm ( ctx0 , cur , eps ) ;
2023-12-29 17:52:15 +01:00
cur = ggml_add ( ctx0 , ggml_mul ( ctx0 , cur , model . layers [ il ] . ln_1_w ) ,
model . layers [ il ] . ln_1_b ) ;
2023-10-12 17:23:18 +02:00
}
// self-attention
{
struct ggml_tensor * Q =
2023-12-29 17:52:15 +01:00
ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . q_w , cur ) , model . layers [ il ] . q_b ) ;
2023-10-12 17:23:18 +02:00
2023-12-21 22:20:49 +01:00
Q = ggml_scale_inplace ( ctx0 , Q , 1.0f / sqrt ( ( float ) d_head ) ) ;
2023-10-12 17:23:18 +02:00
Q = ggml_reshape_4d ( ctx0 , Q , d_head , n_head , num_positions , batch_size ) ;
Q = ggml_cont ( ctx0 , ggml_permute ( ctx0 , Q , 0 , 2 , 1 , 3 ) ) ;
Q = ggml_reshape_3d ( ctx0 , Q , d_head , num_positions , n_head * batch_size ) ;
struct ggml_tensor * K =
2023-12-29 17:52:15 +01:00
ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . k_w , cur ) , model . layers [ il ] . k_b ) ;
2023-10-12 17:23:18 +02:00
K = ggml_reshape_4d ( ctx0 , K , d_head , n_head , num_positions , batch_size ) ;
K = ggml_cont ( ctx0 , ggml_permute ( ctx0 , K , 0 , 2 , 1 , 3 ) ) ;
K = ggml_reshape_3d ( ctx0 , K , d_head , num_positions , n_head * batch_size ) ;
struct ggml_tensor * V =
2023-12-29 17:52:15 +01:00
ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . v_w , cur ) , model . layers [ il ] . v_b ) ;
2023-10-12 17:23:18 +02:00
V = ggml_reshape_4d ( ctx0 , V , d_head , n_head , num_positions , batch_size ) ;
V = ggml_cont ( ctx0 , ggml_permute ( ctx0 , V , 1 , 2 , 0 , 3 ) ) ;
V = ggml_reshape_3d ( ctx0 , V , num_positions , d_head , n_head * batch_size ) ;
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
KQ = ggml_soft_max_inplace ( ctx0 , KQ ) ;
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ ) ;
KQV = ggml_reshape_4d ( ctx0 , KQV , d_head , num_positions , n_head , batch_size ) ;
2024-02-19 14:23:17 +01:00
KQV = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
2023-10-12 17:23:18 +02:00
2024-02-19 14:02:36 +01:00
cur = ggml_cont_3d ( ctx0 , KQV , hidden_size , num_positions , batch_size ) ;
2023-10-12 17:23:18 +02:00
}
// attention output
2023-12-29 17:52:15 +01:00
cur = ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . o_w , cur ) , model . layers [ il ] . o_b ) ;
2023-10-12 17:23:18 +02:00
// re-add the layer input, e.g., residual
cur = ggml_add ( ctx0 , cur , embeddings ) ;
embeddings = cur ; // embeddings = residual, cur = hidden_states
// layernorm2
{
cur = ggml_norm ( ctx0 , cur , eps ) ;
2023-12-29 17:52:15 +01:00
cur = ggml_add ( ctx0 , ggml_mul ( ctx0 , cur , model . layers [ il ] . ln_2_w ) , model . layers [ il ] . ln_2_b ) ;
2023-10-12 17:23:18 +02:00
}
cur = ggml_mul_mat ( ctx0 , model . layers [ il ] . ff_i_w , cur ) ;
2023-12-29 17:52:15 +01:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . ff_i_b ) ;
2023-10-12 17:23:18 +02:00
if ( ctx - > use_gelu ) {
cur = ggml_gelu_inplace ( ctx0 , cur ) ;
} else {
cur = ggml_gelu_quick_inplace ( ctx0 , cur ) ;
}
cur = ggml_mul_mat ( ctx0 , model . layers [ il ] . ff_o_w , cur ) ;
2023-12-29 17:52:15 +01:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . ff_o_b ) ;
2023-10-12 17:23:18 +02:00
// residual 2
cur = ggml_add ( ctx0 , embeddings , cur ) ;
embeddings = cur ;
}
// llava projector
{
embeddings = ggml_reshape_2d ( ctx0 , embeddings , embeddings - > ne [ 0 ] , embeddings - > ne [ 1 ] ) ;
struct ggml_tensor * patches = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , num_patches ) ;
2024-02-12 08:16:06 +01:00
ggml_set_name ( patches , " patches " ) ;
ggml_set_input ( patches ) ;
2023-10-12 17:23:18 +02:00
2024-01-22 14:09:35 +01:00
// shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1]
2023-10-12 17:23:18 +02:00
embeddings = ggml_get_rows ( ctx0 , embeddings , patches ) ;
2024-01-22 14:09:35 +01:00
// print_tensor_info(embeddings, "embeddings");
// llava projector
if ( ctx - > proj_type = = PROJECTOR_TYPE_MLP ) {
embeddings = ggml_mul_mat ( ctx0 , model . mm_0_w , embeddings ) ;
embeddings = ggml_add ( ctx0 , embeddings , model . mm_0_b ) ;
embeddings = ggml_gelu ( ctx0 , embeddings ) ;
embeddings = ggml_mul_mat ( ctx0 , model . mm_2_w , embeddings ) ;
embeddings = ggml_add ( ctx0 , embeddings , model . mm_2_b ) ;
2024-01-27 16:09:18 +01:00
} else if ( ctx - > proj_type = = PROJECTOR_TYPE_MLP_NORM ) {
embeddings = ggml_mul_mat ( ctx0 , model . mm_0_w , embeddings ) ;
embeddings = ggml_add ( ctx0 , embeddings , model . mm_0_b ) ;
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
// First LayerNorm
embeddings = ggml_norm ( ctx0 , embeddings , eps ) ;
embeddings = ggml_add ( ctx0 , ggml_mul ( ctx0 , embeddings , model . mm_1_w ) ,
model . mm_1_b ) ;
// GELU activation
embeddings = ggml_gelu ( ctx0 , embeddings ) ;
// Second linear layer
embeddings = ggml_mul_mat ( ctx0 , model . mm_3_w , embeddings ) ;
embeddings = ggml_add ( ctx0 , embeddings , model . mm_3_b ) ;
// Second LayerNorm
embeddings = ggml_norm ( ctx0 , embeddings , eps ) ;
embeddings = ggml_add ( ctx0 , ggml_mul ( ctx0 , embeddings , model . mm_4_w ) ,
model . mm_4_b ) ;
2024-01-22 14:09:35 +01:00
}
else if ( ctx - > proj_type = = PROJECTOR_TYPE_LDP ) {
// MobileVLM projector
int n_patch = 24 ;
struct ggml_tensor * mlp_1 = ggml_mul_mat ( ctx0 , model . mm_model_mlp_1_w , embeddings ) ;
mlp_1 = ggml_add ( ctx0 , mlp_1 , model . mm_model_mlp_1_b ) ;
mlp_1 = ggml_gelu ( ctx0 , mlp_1 ) ;
struct ggml_tensor * mlp_3 = ggml_mul_mat ( ctx0 , model . mm_model_mlp_3_w , mlp_1 ) ;
mlp_3 = ggml_add ( ctx0 , mlp_3 , model . mm_model_mlp_3_b ) ;
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
struct ggml_tensor * block_1 = nullptr ;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , mlp_3 , 1 , 0 , 2 , 3 ) ) ;
mlp_3 = ggml_reshape_4d ( ctx0 , mlp_3 , n_patch , n_patch , mlp_3 - > ne [ 1 ] , mlp_3 - > ne [ 2 ] ) ;
// stride = 1, padding = 1, bias is nullptr
2024-01-23 13:12:57 +01:00
block_1 = ggml_conv_depthwise_2d ( ctx0 , model . mm_model_block_1_block_0_0_w , mlp_3 , 1 , 1 , 1 , 1 , 1 , 1 ) ;
2024-01-22 14:09:35 +01:00
// layer norm
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , block_1 , 1 , 2 , 0 , 3 ) ) ;
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm ( ctx0 , block_1 , eps ) ;
block_1 = ggml_add ( ctx0 , ggml_mul ( ctx0 , block_1 , model . mm_model_block_1_block_0_1_w ) , model . mm_model_block_1_block_0_1_b ) ;
block_1 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , block_1 , 2 , 0 , 1 , 3 ) ) ;
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish ( ctx0 , block_1 ) ;
block_1 = ggml_pool_2d ( ctx0 , block_1_hw , GGML_OP_POOL_AVG , block_1_hw - > ne [ 0 ] , block_1_hw - > ne [ 1 ] , block_1_hw - > ne [ 0 ] , block_1_hw - > ne [ 1 ] , 0 , 0 ) ;
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d ( ctx0 , block_1 , block_1 - > ne [ 0 ] * block_1 - > ne [ 1 ] * block_1 - > ne [ 2 ] , block_1 - > ne [ 3 ] ) ;
block_1 = ggml_mul_mat ( ctx0 , model . mm_model_block_1_block_1_fc1_w , block_1 ) ;
block_1 = ggml_add ( ctx0 , block_1 , model . mm_model_block_1_block_1_fc1_b ) ;
block_1 = ggml_relu ( ctx0 , block_1 ) ;
block_1 = ggml_mul_mat ( ctx0 , model . mm_model_block_1_block_1_fc2_w , block_1 ) ;
block_1 = ggml_add ( ctx0 , block_1 , model . mm_model_block_1_block_1_fc2_b ) ;
block_1 = ggml_hardsigmoid ( ctx0 , block_1 ) ;
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
block_1 = ggml_reshape_4d ( ctx0 , block_1 , 1 , 1 , block_1 - > ne [ 0 ] , block_1 - > ne [ 1 ] ) ;
block_1 = ggml_mul ( ctx0 , block_1_hw , block_1 ) ;
int w = block_1 - > ne [ 0 ] , h = block_1 - > ne [ 1 ] ;
block_1 = ggml_reshape_3d ( ctx0 , block_1 , w * h , block_1 - > ne [ 2 ] , block_1 - > ne [ 3 ] ) ;
block_1 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , block_1 , 1 , 0 , 2 , 3 ) ) ;
// block_1 shape = [1, 24*24, 2048], ne = [2048, 24*24, 1]
block_1 = ggml_mul_mat ( ctx0 , model . mm_model_block_1_block_2_0_w , block_1 ) ;
block_1 = ggml_reshape_4d ( ctx0 , block_1 , block_1 - > ne [ 0 ] , w , h , block_1 - > ne [ 3 ] ) ;
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm ( ctx0 , block_1 , eps ) ;
block_1 = ggml_add ( ctx0 , ggml_mul ( ctx0 , block_1 , model . mm_model_block_1_block_2_1_w ) , model . mm_model_block_1_block_2_1_b ) ;
block_1 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , block_1 , 2 , 0 , 1 , 3 ) ) ;
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add ( ctx0 , mlp_3 , block_1 ) ;
}
2023-10-12 17:23:18 +02:00
2024-01-22 14:09:35 +01:00
// block_2
{
// stride = 2
2024-01-23 13:12:57 +01:00
block_1 = ggml_conv_depthwise_2d ( ctx0 , model . mm_model_block_2_block_0_0_w , block_1 , 2 , 2 , 1 , 1 , 1 , 1 ) ;
2024-01-22 14:09:35 +01:00
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , block_1 , 1 , 2 , 0 , 3 ) ) ;
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm ( ctx0 , block_1 , eps ) ;
block_1 = ggml_add ( ctx0 , ggml_mul ( ctx0 , block_1 , model . mm_model_block_2_block_0_1_w ) , model . mm_model_block_2_block_0_1_b ) ;
block_1 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , block_1 , 2 , 0 , 1 , 3 ) ) ;
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish ( ctx0 , block_1 ) ;
// not sure the parameters are right for global average pooling
block_1 = ggml_pool_2d ( ctx0 , block_1_hw , GGML_OP_POOL_AVG , block_1_hw - > ne [ 0 ] , block_1_hw - > ne [ 1 ] , block_1_hw - > ne [ 0 ] , block_1_hw - > ne [ 1 ] , 0 , 0 ) ;
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d ( ctx0 , block_1 , block_1 - > ne [ 0 ] * block_1 - > ne [ 1 ] * block_1 - > ne [ 2 ] , block_1 - > ne [ 3 ] ) ;
block_1 = ggml_mul_mat ( ctx0 , model . mm_model_block_2_block_1_fc1_w , block_1 ) ;
block_1 = ggml_add ( ctx0 , block_1 , model . mm_model_block_2_block_1_fc1_b ) ;
block_1 = ggml_relu ( ctx0 , block_1 ) ;
block_1 = ggml_mul_mat ( ctx0 , model . mm_model_block_2_block_1_fc2_w , block_1 ) ;
block_1 = ggml_add ( ctx0 , block_1 , model . mm_model_block_2_block_1_fc2_b ) ;
block_1 = ggml_hardsigmoid ( ctx0 , block_1 ) ;
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d ( ctx0 , block_1 , 1 , 1 , block_1 - > ne [ 0 ] , block_1 - > ne [ 1 ] ) ;
block_1 = ggml_mul ( ctx0 , block_1_hw , block_1 ) ;
int w = block_1 - > ne [ 0 ] , h = block_1 - > ne [ 1 ] ;
block_1 = ggml_reshape_3d ( ctx0 , block_1 , w * h , block_1 - > ne [ 2 ] , block_1 - > ne [ 3 ] ) ;
block_1 = ggml_cont ( ctx0 , ggml_permute ( ctx0 , block_1 , 1 , 0 , 2 , 3 ) ) ;
// block_1 shape = [1, 12*12, 2048], ne = [2048, 12*12, 1]
block_1 = ggml_mul_mat ( ctx0 , model . mm_model_block_2_block_2_0_w , block_1 ) ;
block_1 = ggml_reshape_4d ( ctx0 , block_1 , block_1 - > ne [ 0 ] , w , h , block_1 - > ne [ 3 ] ) ;
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm ( ctx0 , block_1 , eps ) ;
block_1 = ggml_add ( ctx0 , ggml_mul ( ctx0 , block_1 , model . mm_model_block_2_block_2_1_w ) , model . mm_model_block_2_block_2_1_b ) ;
block_1 = ggml_reshape_3d ( ctx0 , block_1 , block_1 - > ne [ 0 ] , block_1 - > ne [ 1 ] * block_1 - > ne [ 2 ] , block_1 - > ne [ 3 ] ) ;
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
}
embeddings = block_1 ;
}
else {
GGML_ASSERT ( false ) ;
}
2023-10-12 17:23:18 +02:00
}
// build the graph
ggml_build_forward_expand ( gf , embeddings ) ;
ggml_free ( ctx0 ) ;
return gf ;
}
// read and create ggml_context containing the tensors and their data
struct clip_ctx * clip_model_load ( const char * fname , const int verbosity = 1 ) {
struct ggml_context * meta = NULL ;
struct gguf_init_params params = {
/*.no_alloc = */ true ,
/*.ctx = */ & meta ,
} ;
struct gguf_context * ctx = gguf_init_from_file ( fname , params ) ;
2023-10-19 15:59:11 +02:00
if ( ! ctx ) {
throw std : : runtime_error ( format ( " %s: failed to load CLIP model from %s. Does this file exist? \n " , __func__ , fname ) ) ;
}
2023-10-12 17:23:18 +02:00
if ( verbosity > = 1 ) {
const int n_tensors = gguf_get_n_tensors ( ctx ) ;
const int n_kv = gguf_get_n_kv ( ctx ) ;
const int ftype = get_u32 ( ctx , KEY_FTYPE ) ;
const std : : string ftype_str = get_ftype ( ftype ) ;
const int idx_desc = get_key_idx ( ctx , KEY_DESCRIPTION ) ;
const std : : string description = gguf_get_val_str ( ctx , idx_desc ) ;
const int idx_name = gguf_find_key ( ctx , KEY_NAME ) ;
if ( idx_name ! = - 1 ) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std : : string name = gguf_get_val_str ( ctx , idx_name ) ;
printf ( " %s: model name: %s \n " , __func__ , name . c_str ( ) ) ;
}
printf ( " %s: description: %s \n " , __func__ , description . c_str ( ) ) ;
printf ( " %s: GGUF version: %d \n " , __func__ , gguf_get_version ( ctx ) ) ;
printf ( " %s: alignment: %zu \n " , __func__ , gguf_get_alignment ( ctx ) ) ;
printf ( " %s: n_tensors: %d \n " , __func__ , n_tensors ) ;
printf ( " %s: n_kv: %d \n " , __func__ , n_kv ) ;
printf ( " %s: ftype: %s \n " , __func__ , ftype_str . c_str ( ) ) ;
printf ( " \n " ) ;
}
2023-12-29 17:52:15 +01:00
const int n_tensors = gguf_get_n_tensors ( ctx ) ;
2024-01-22 14:09:35 +01:00
2023-10-12 17:23:18 +02:00
// kv
2024-01-22 14:09:35 +01:00
const int n_kv = gguf_get_n_kv ( ctx ) ;
printf ( " %s: loaded meta data with %d key-value pairs and %d tensors from %s \n " ,
__func__ , n_kv , n_tensors , fname ) ;
{
std : : map < enum ggml_type , uint32_t > n_type ;
for ( int i = 0 ; i < n_tensors ; i + + ) {
enum ggml_type type = gguf_get_tensor_type ( ctx , i ) ;
2023-10-12 17:23:18 +02:00
2024-01-22 14:09:35 +01:00
n_type [ type ] + + ;
}
printf ( " %s: Dumping metadata keys/values. Note: KV overrides do not apply in this output. \n " , __func__ ) ;
for ( int i = 0 ; i < n_kv ; i + + ) {
const char * name = gguf_get_key ( ctx , i ) ;
const enum gguf_type type = gguf_get_kv_type ( ctx , i ) ;
const std : : string type_name =
type = = GGUF_TYPE_ARRAY
? format ( " %s[%s,%d] " , gguf_type_name ( type ) , gguf_type_name ( gguf_get_arr_type ( ctx , i ) ) , gguf_get_arr_n ( ctx , i ) )
: gguf_type_name ( type ) ;
std : : string value = gguf_kv_to_str ( ctx , i ) ;
const size_t MAX_VALUE_LEN = 40 ;
if ( value . size ( ) > MAX_VALUE_LEN ) {
value = format ( " %s... " , value . substr ( 0 , MAX_VALUE_LEN - 3 ) . c_str ( ) ) ;
}
replace_all ( value , " \n " , " \\ n " ) ;
printf ( " %s: - kv %3d: %42s %-16s = %s \n " , __func__ , i , name , type_name . c_str ( ) , value . c_str ( ) ) ;
}
// print type counts
for ( auto & kv : n_type ) {
if ( kv . second = = 0 ) {
continue ;
}
printf ( " %s: - type %4s: %4d tensors \n " , __func__ , ggml_type_name ( kv . first ) , kv . second ) ;
2023-10-12 17:23:18 +02:00
}
}
// data
2024-02-12 08:16:06 +01:00
size_t model_size = 0 ;
2023-10-12 17:23:18 +02:00
{
for ( int i = 0 ; i < n_tensors ; + + i ) {
const char * name = gguf_get_tensor_name ( ctx , i ) ;
const size_t offset = gguf_get_tensor_offset ( ctx , i ) ;
2024-01-22 14:09:35 +01:00
enum ggml_type type = gguf_get_tensor_type ( ctx , i ) ;
2023-10-12 17:23:18 +02:00
struct ggml_tensor * cur = ggml_get_tensor ( meta , name ) ;
size_t tensor_size = ggml_nbytes ( cur ) ;
2024-02-12 08:16:06 +01:00
model_size + = tensor_size ;
2023-10-12 17:23:18 +02:00
if ( verbosity > = 3 ) {
2024-01-23 13:12:57 +01:00
printf ( " %s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[% " PRIu64 " , % " PRIu64 " , % " PRIu64 " , % " PRIu64 " ], type = %s \n " ,
__func__ , i , ggml_n_dims ( cur ) , cur - > name , tensor_size , offset , cur - > ne [ 0 ] , cur - > ne [ 1 ] , cur - > ne [ 2 ] , cur - > ne [ 3 ] , ggml_type_name ( type ) ) ;
2023-10-12 17:23:18 +02:00
}
}
}
clip_ctx * new_clip = new clip_ctx ;
2024-01-10 14:37:09 +01:00
2024-01-22 14:09:35 +01:00
// update projector type
{
int idx = gguf_find_key ( ctx , KEY_PROJ_TYPE ) ;
if ( idx ! = - 1 ) {
const std : : string proj_type = gguf_get_val_str ( ctx , idx ) ;
new_clip - > proj_type = clip_projector_type_from_string ( proj_type ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
} else {
2024-01-22 14:09:35 +01:00
new_clip - > proj_type = PROJECTOR_TYPE_MLP ;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
2024-01-27 16:09:18 +01:00
if ( new_clip - > proj_type = = PROJECTOR_TYPE_MLP ) {
if ( gguf_find_tensor ( ctx , format ( TN_LLAVA_PROJ , 3 , " weight " ) . c_str ( ) ) ! = - 1 ) {
new_clip - > proj_type = PROJECTOR_TYPE_MLP_NORM ;
}
}
2024-01-22 14:09:35 +01:00
}
2023-12-29 17:52:15 +01:00
# ifdef GGML_USE_CUBLAS
new_clip - > backend = ggml_backend_cuda_init ( 0 ) ;
printf ( " %s: CLIP using CUDA backend \n " , __func__ ) ;
# endif
# ifdef GGML_USE_METAL
new_clip - > backend = ggml_backend_metal_init ( ) ;
printf ( " %s: CLIP using Metal backend \n " , __func__ ) ;
# endif
2024-01-10 14:37:09 +01:00
2023-12-29 17:52:15 +01:00
if ( ! new_clip - > backend ) {
new_clip - > backend = ggml_backend_cpu_init ( ) ;
printf ( " %s: CLIP using CPU backend \n " , __func__ ) ;
}
2023-10-12 17:23:18 +02:00
// model size and capabilities
{
int idx = get_key_idx ( ctx , KEY_HAS_TEXT_ENC ) ;
new_clip - > has_text_encoder = gguf_get_val_bool ( ctx , idx ) ;
idx = get_key_idx ( ctx , KEY_HAS_VIS_ENC ) ;
new_clip - > has_vision_encoder = gguf_get_val_bool ( ctx , idx ) ;
idx = gguf_find_key ( ctx , KEY_HAS_LLAVA_PROJ ) ;
if ( idx ! = - 1 ) {
new_clip - > has_llava_projector = gguf_get_val_bool ( ctx , idx ) ;
}
GGML_ASSERT ( new_clip - > has_llava_projector ) ; // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT ( new_clip - > has_vision_encoder ) ;
GGML_ASSERT ( ! new_clip - > has_text_encoder ) ;
idx = get_key_idx ( ctx , KEY_USE_GELU ) ;
new_clip - > use_gelu = gguf_get_val_bool ( ctx , idx ) ;
if ( verbosity > = 1 ) {
printf ( " %s: text_encoder: %d \n " , __func__ , new_clip - > has_text_encoder ) ;
printf ( " %s: vision_encoder: %d \n " , __func__ , new_clip - > has_vision_encoder ) ;
printf ( " %s: llava_projector: %d \n " , __func__ , new_clip - > has_llava_projector ) ;
2024-02-12 08:16:06 +01:00
printf ( " %s: model size: %.2f MB \n " , __func__ , model_size / 1024.0 / 1024.0 ) ;
2023-10-12 17:23:18 +02:00
printf ( " %s: metadata size: %.2f MB \n " , __func__ , ggml_get_mem_size ( meta ) / 1024.0 / 1024.0 ) ;
}
}
2024-02-12 08:16:06 +01:00
printf ( " %s: params backend buffer size = % 6.2f MB (%i tensors) \n " , __func__ , model_size / ( 1024.0 * 1024.0 ) , n_tensors ) ;
2023-12-29 17:52:15 +01:00
2023-10-12 17:23:18 +02:00
// load tensors
{
2023-12-29 17:52:15 +01:00
std : : vector < uint8_t > read_buf ;
2023-10-12 17:23:18 +02:00
struct ggml_init_params params = {
2023-12-29 17:52:15 +01:00
/*.mem_size =*/ ( n_tensors + 1 ) * ggml_tensor_overhead ( ) ,
2023-10-12 17:23:18 +02:00
/*.mem_buffer =*/ NULL ,
2023-12-29 17:52:15 +01:00
/*.no_alloc =*/ true ,
2023-10-12 17:23:18 +02:00
} ;
2023-12-30 22:24:42 +01:00
new_clip - > ctx_data = ggml_init ( params ) ;
if ( ! new_clip - > ctx_data ) {
2023-10-12 17:23:18 +02:00
fprintf ( stderr , " %s: ggml_init() failed \n " , __func__ ) ;
clip_free ( new_clip ) ;
2024-03-14 19:29:32 +01:00
gguf_free ( ctx ) ;
2023-10-12 17:23:18 +02:00
return nullptr ;
}
auto fin = std : : ifstream ( fname , std : : ios : : binary ) ;
if ( ! fin ) {
printf ( " cannot open model file for loading tensors \n " ) ;
clip_free ( new_clip ) ;
2024-03-14 19:29:32 +01:00
gguf_free ( ctx ) ;
2023-10-12 17:23:18 +02:00
return nullptr ;
}
2023-12-29 17:52:15 +01:00
// add tensors to context
2023-10-12 17:23:18 +02:00
for ( int i = 0 ; i < n_tensors ; + + i ) {
const char * name = gguf_get_tensor_name ( ctx , i ) ;
struct ggml_tensor * t = ggml_get_tensor ( meta , name ) ;
2023-12-30 22:24:42 +01:00
struct ggml_tensor * cur = ggml_dup_tensor ( new_clip - > ctx_data , t ) ;
2023-10-12 17:23:18 +02:00
ggml_set_name ( cur , name ) ;
2023-12-29 17:52:15 +01:00
}
2023-10-12 17:23:18 +02:00
2023-12-29 17:52:15 +01:00
// alloc memory and offload data
2024-02-12 08:16:06 +01:00
new_clip - > params_buffer = ggml_backend_alloc_ctx_tensors ( new_clip - > ctx_data , new_clip - > backend ) ;
2023-12-29 17:52:15 +01:00
for ( int i = 0 ; i < n_tensors ; + + i ) {
const char * name = gguf_get_tensor_name ( ctx , i ) ;
2023-12-30 22:24:42 +01:00
struct ggml_tensor * cur = ggml_get_tensor ( new_clip - > ctx_data , name ) ;
2023-10-12 17:23:18 +02:00
const size_t offset = gguf_get_data_offset ( ctx ) + gguf_get_tensor_offset ( ctx , i ) ;
fin . seekg ( offset , std : : ios : : beg ) ;
if ( ! fin ) {
printf ( " %s: failed to seek for tensor %s \n " , __func__ , name ) ;
clip_free ( new_clip ) ;
2024-03-14 19:29:32 +01:00
gguf_free ( ctx ) ;
2023-10-12 17:23:18 +02:00
return nullptr ;
}
2023-12-29 17:52:15 +01:00
int num_bytes = ggml_nbytes ( cur ) ;
2023-12-29 17:53:34 +01:00
if ( ggml_backend_buffer_is_host ( new_clip - > params_buffer ) ) {
2023-12-29 17:52:15 +01:00
// for the CPU and Metal backend, we can read directly into the tensor
fin . read ( reinterpret_cast < char * > ( cur - > data ) , num_bytes ) ;
} else {
// read into a temporary buffer first, then copy to device memory
read_buf . resize ( num_bytes ) ;
fin . read ( reinterpret_cast < char * > ( read_buf . data ( ) ) , num_bytes ) ;
ggml_backend_tensor_set ( cur , read_buf . data ( ) , 0 , num_bytes ) ;
}
2023-10-12 17:23:18 +02:00
}
fin . close ( ) ;
}
// vision model
if ( new_clip - > has_vision_encoder ) {
// load vision model
auto & vision_model = new_clip - > vision_model ;
auto & hparams = vision_model . hparams ;
2023-12-30 22:24:42 +01:00
hparams . hidden_size = get_u32 ( ctx , format ( KEY_N_EMBD , " vision " ) ) ;
hparams . n_head = get_u32 ( ctx , format ( KEY_N_HEAD , " vision " ) ) ;
2023-10-12 17:23:18 +02:00
hparams . n_intermediate = get_u32 ( ctx , format ( KEY_N_FF , " vision " ) ) ;
2023-12-30 22:24:42 +01:00
hparams . n_layer = get_u32 ( ctx , format ( KEY_N_BLOCK , " vision " ) ) ;
hparams . image_size = get_u32 ( ctx , KEY_IMAGE_SIZE ) ;
hparams . patch_size = get_u32 ( ctx , KEY_PATCH_SIZE ) ;
2023-10-12 17:23:18 +02:00
hparams . projection_dim = get_u32 ( ctx , format ( KEY_PROJ_DIM , " vision " ) ) ;
2023-12-30 22:24:42 +01:00
hparams . eps = get_f32 ( ctx , format ( KEY_LAYER_NORM_EPS , " vision " ) ) ;
2023-10-12 17:23:18 +02:00
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
try {
int idx = get_key_idx ( ctx , KEY_IMAGE_GRID_PINPOINTS ) ;
int n = gguf_get_arr_n ( ctx , idx ) ;
const int32_t * pinpoints = ( const int32_t * ) gguf_get_arr_data ( ctx , idx ) ;
for ( int i = 0 ; i < 32 & & i < n & & pinpoints [ i ] ! = 0 ; + + i ) {
hparams . image_grid_pinpoints [ i ] = pinpoints [ i ] ;
}
if ( n < 32 )
hparams . image_grid_pinpoints [ n ] = 0 ;
} catch ( std : : runtime_error & e ) {
hparams . image_grid_pinpoints [ 0 ] = 0 ;
}
try {
int idx = get_key_idx ( ctx , KEY_MM_PATCH_MERGE_TYPE ) ;
strcpy ( hparams . mm_patch_merge_type , gguf_get_val_str ( ctx , idx ) ) ;
} catch ( std : : runtime_error & e ) {
strcpy ( hparams . mm_patch_merge_type , " flat " ) ;
}
try {
hparams . image_crop_resolution = get_u32 ( ctx , KEY_IMAGE_CROP_RESOLUTION ) ; // llava-1.6
} catch ( const std : : exception & e ) {
hparams . image_crop_resolution = hparams . image_size ;
}
2023-10-12 17:23:18 +02:00
int idx_mean = get_key_idx ( ctx , KEY_IMAGE_MEAN ) ;
2023-12-30 22:24:42 +01:00
int idx_std = get_key_idx ( ctx , KEY_IMAGE_STD ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
const float * mean_data = ( const float * ) gguf_get_arr_data ( ctx , idx_mean ) ;
const float * std_data = ( const float * ) gguf_get_arr_data ( ctx , idx_std ) ;
2023-10-12 17:23:18 +02:00
for ( int i = 0 ; i < 3 ; + + i ) {
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
new_clip - > image_mean [ i ] = mean_data [ i ] ;
new_clip - > image_std [ i ] = std_data [ i ] ;
2023-10-12 17:23:18 +02:00
}
if ( verbosity > = 2 ) {
printf ( " \n %s: vision model hparams \n " , __func__ ) ;
printf ( " image_size %d \n " , hparams . image_size ) ;
printf ( " patch_size %d \n " , hparams . patch_size ) ;
printf ( " v_hidden_size %d \n " , hparams . hidden_size ) ;
printf ( " v_n_intermediate %d \n " , hparams . n_intermediate ) ;
printf ( " v_projection_dim %d \n " , hparams . projection_dim ) ;
printf ( " v_n_head %d \n " , hparams . n_head ) ;
printf ( " v_n_layer %d \n " , hparams . n_layer ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
printf ( " v_eps %f \n " , hparams . eps ) ;
printf ( " v_image_mean %f %f %f \n " , new_clip - > image_mean [ 0 ] , new_clip - > image_mean [ 1 ] , new_clip - > image_mean [ 2 ] ) ;
printf ( " v_image_std %f %f %f \n " , new_clip - > image_std [ 0 ] , new_clip - > image_std [ 1 ] , new_clip - > image_std [ 2 ] ) ;
printf ( " v_image_grid_pinpoints: " ) ;
2024-02-15 17:49:08 +01:00
for ( int i = 0 ; i < 32 & & ( hparams . image_grid_pinpoints [ i ] ! = 0 ) ; + + i ) {
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
printf ( " %d " , hparams . image_grid_pinpoints [ i ] ) ;
}
printf ( " \n " ) ;
printf ( " v_mm_patch_merge_type: %s \n " , hparams . mm_patch_merge_type ) ;
2023-10-12 17:23:18 +02:00
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
try {
vision_model . patch_embeddings = get_tensor ( new_clip - > ctx_data , TN_PATCH_EMBD ) ;
vision_model . class_embedding = get_tensor ( new_clip - > ctx_data , TN_CLASS_EMBD ) ;
vision_model . position_embeddings = get_tensor ( new_clip - > ctx_data , format ( TN_POS_EMBD , " v " ) ) ;
vision_model . pre_ln_w = get_tensor ( new_clip - > ctx_data , format ( TN_LN_PRE , " v " , " weight " ) ) ;
vision_model . pre_ln_b = get_tensor ( new_clip - > ctx_data , format ( TN_LN_PRE , " v " , " bias " ) ) ;
} catch ( const std : : exception & e ) {
fprintf ( stderr , " %s: failed to load vision model tensors \n " , __func__ ) ;
}
2024-01-22 14:09:35 +01:00
// LLaVA projection
2024-01-27 16:09:18 +01:00
if ( new_clip - > proj_type = = PROJECTOR_TYPE_MLP | | new_clip - > proj_type = = PROJECTOR_TYPE_MLP_NORM ) {
2024-01-22 14:09:35 +01:00
vision_model . mm_0_w = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 0 , " weight " ) ) ;
vision_model . mm_0_b = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 0 , " bias " ) ) ;
2024-01-27 16:09:18 +01:00
try {
// Yi-type llava
vision_model . mm_1_w = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 1 , " weight " ) ) ;
vision_model . mm_1_b = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 1 , " bias " ) ) ;
} catch ( std : : runtime_error & e ) { }
try {
// missing in Yi-type llava
vision_model . mm_2_w = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 2 , " weight " ) ) ;
vision_model . mm_2_b = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 2 , " bias " ) ) ;
} catch ( std : : runtime_error & e ) { }
try {
// Yi-type llava
vision_model . mm_3_w = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 3 , " weight " ) ) ;
vision_model . mm_3_b = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 3 , " bias " ) ) ;
} catch ( std : : runtime_error & e ) { }
try {
// Yi-type llava
vision_model . mm_4_w = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 4 , " weight " ) ) ;
vision_model . mm_4_b = get_tensor ( new_clip - > ctx_data , format ( TN_LLAVA_PROJ , 4 , " bias " ) ) ;
} catch ( std : : runtime_error & e ) { }
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
try {
vision_model . image_newline = get_tensor ( new_clip - > ctx_data , TN_IMAGE_NEWLINE ) ;
// fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch ( std : : runtime_error & e ) { }
} else if ( new_clip - > proj_type = = PROJECTOR_TYPE_LDP ) {
2024-01-22 14:09:35 +01:00
// MobileVLM projection
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
vision_model . mm_model_mlp_1_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_MLP , 1 , " weight " ) ) ;
vision_model . mm_model_mlp_1_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_MLP , 1 , " bias " ) ) ;
vision_model . mm_model_mlp_3_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_MLP , 3 , " weight " ) ) ;
vision_model . mm_model_mlp_3_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_MLP , 3 , " bias " ) ) ;
vision_model . mm_model_block_1_block_0_0_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 0 , " 0.weight " ) ) ;
vision_model . mm_model_block_1_block_0_1_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 0 , " 1.weight " ) ) ;
vision_model . mm_model_block_1_block_0_1_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 0 , " 1.bias " ) ) ;
2024-01-22 14:09:35 +01:00
vision_model . mm_model_block_1_block_1_fc1_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 1 , " fc1.weight " ) ) ;
vision_model . mm_model_block_1_block_1_fc1_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 1 , " fc1.bias " ) ) ;
vision_model . mm_model_block_1_block_1_fc2_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 1 , " fc2.weight " ) ) ;
vision_model . mm_model_block_1_block_1_fc2_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 1 , " fc2.bias " ) ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
vision_model . mm_model_block_1_block_2_0_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 2 , " 0.weight " ) ) ;
vision_model . mm_model_block_1_block_2_1_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 2 , " 1.weight " ) ) ;
vision_model . mm_model_block_1_block_2_1_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 1 , 2 , " 1.bias " ) ) ;
vision_model . mm_model_block_2_block_0_0_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 0 , " 0.weight " ) ) ;
vision_model . mm_model_block_2_block_0_1_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 0 , " 1.weight " ) ) ;
vision_model . mm_model_block_2_block_0_1_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 0 , " 1.bias " ) ) ;
2024-01-22 14:09:35 +01:00
vision_model . mm_model_block_2_block_1_fc1_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 1 , " fc1.weight " ) ) ;
vision_model . mm_model_block_2_block_1_fc1_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 1 , " fc1.bias " ) ) ;
vision_model . mm_model_block_2_block_1_fc2_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 1 , " fc2.weight " ) ) ;
vision_model . mm_model_block_2_block_1_fc2_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 1 , " fc2.bias " ) ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
vision_model . mm_model_block_2_block_2_0_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 2 , " 0.weight " ) ) ;
vision_model . mm_model_block_2_block_2_1_w = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 2 , " 1.weight " ) ) ;
vision_model . mm_model_block_2_block_2_1_b = get_tensor ( new_clip - > ctx_data , format ( TN_MVLM_PROJ_BLOCK , 2 , 2 , " 1.bias " ) ) ;
} else {
2024-01-22 14:09:35 +01:00
std : : string proj_type = PROJECTOR_TYPE_NAMES [ new_clip - > proj_type ] ;
throw std : : runtime_error ( format ( " %s: don't support projector with: %s currently \n " , __func__ , proj_type . c_str ( ) ) ) ;
}
2023-10-12 17:23:18 +02:00
vision_model . layers . resize ( hparams . n_layer ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
2023-10-12 17:23:18 +02:00
for ( int il = 0 ; il < hparams . n_layer ; + + il ) {
auto & layer = vision_model . layers [ il ] ;
2023-12-30 22:24:42 +01:00
layer . k_w = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_K , " v " , il , " weight " ) ) ;
layer . q_w = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_Q , " v " , il , " weight " ) ) ;
layer . v_w = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_V , " v " , il , " weight " ) ) ;
layer . o_w = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_OUTPUT , " v " , il , " weight " ) ) ;
layer . ln_1_w = get_tensor ( new_clip - > ctx_data , format ( TN_LN_1 , " v " , il , " weight " ) ) ;
layer . ln_2_w = get_tensor ( new_clip - > ctx_data , format ( TN_LN_2 , " v " , il , " weight " ) ) ;
layer . ff_i_w = get_tensor ( new_clip - > ctx_data , format ( TN_FFN_DOWN , " v " , il , " weight " ) ) ;
layer . ff_o_w = get_tensor ( new_clip - > ctx_data , format ( TN_FFN_UP , " v " , il , " weight " ) ) ;
layer . k_b = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_K , " v " , il , " bias " ) ) ;
layer . q_b = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_Q , " v " , il , " bias " ) ) ;
layer . v_b = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_V , " v " , il , " bias " ) ) ;
layer . o_b = get_tensor ( new_clip - > ctx_data , format ( TN_ATTN_OUTPUT , " v " , il , " bias " ) ) ;
layer . ln_1_b = get_tensor ( new_clip - > ctx_data , format ( TN_LN_1 , " v " , il , " bias " ) ) ;
layer . ln_2_b = get_tensor ( new_clip - > ctx_data , format ( TN_LN_2 , " v " , il , " bias " ) ) ;
layer . ff_i_b = get_tensor ( new_clip - > ctx_data , format ( TN_FFN_DOWN , " v " , il , " bias " ) ) ;
layer . ff_o_b = get_tensor ( new_clip - > ctx_data , format ( TN_FFN_UP , " v " , il , " bias " ) ) ;
2023-10-12 17:23:18 +02:00
}
}
ggml_free ( meta ) ;
new_clip - > ctx_gguf = ctx ;
2023-12-30 22:24:42 +01:00
// measure mem requirement and allocate
2023-10-12 17:23:18 +02:00
{
2023-12-30 22:24:42 +01:00
new_clip - > buf_compute_meta . resize ( GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead ( ) + ggml_graph_overhead ( ) ) ;
2024-02-12 08:16:06 +01:00
new_clip - > compute_alloc = ggml_gallocr_new ( ggml_backend_get_default_buffer_type ( new_clip - > backend ) ) ;
2023-10-12 17:23:18 +02:00
clip_image_f32_batch batch ;
batch . size = 1 ;
ggml_cgraph * gf = clip_image_build_graph ( new_clip , & batch ) ;
2024-02-12 08:16:06 +01:00
ggml_gallocr_reserve ( new_clip - > compute_alloc , gf ) ;
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size ( new_clip - > compute_alloc , 0 ) ;
2023-12-29 17:52:15 +01:00
printf ( " %s: compute allocated memory: %.2f MB \n " , __func__ , compute_memory_buffer_size / 1024.0 / 1024.0 ) ;
2023-10-12 17:23:18 +02:00
}
return new_clip ;
}
2023-12-30 22:24:42 +01:00
// Allocates a value-initialized clip_image_u8 with `new` and returns it.
// The caller owns the object; clip_image_u8_free() deletes it.
struct clip_image_u8 * clip_image_u8_init() {
    clip_image_u8 * img = new clip_image_u8();
    return img;
}
// Allocates a value-initialized clip_image_f32 with `new` and returns it.
// The caller owns the object; clip_image_f32_free() deletes it.
struct clip_image_f32 * clip_image_f32_init() {
    clip_image_f32 * img = new clip_image_f32();
    return img;
}
2023-10-12 17:23:18 +02:00
2024-02-15 09:01:57 +01:00
// Deletes an image obtained from clip_image_u8_init(). Passing a null pointer is a no-op.
void clip_image_u8_free(struct clip_image_u8 * img) {
    delete img;
}
2023-12-30 22:24:42 +01:00
// Deletes an image obtained from clip_image_f32_init(). Passing a null pointer is a no-op.
void clip_image_f32_free(struct clip_image_f32 * img) {
    delete img;
}
2024-02-15 09:01:57 +01:00
// Releases the image array held by `batch` and marks the batch as empty.
//
// The array is only deleted when batch.size is non-zero, so batch.data is never
// touched for a batch whose size is 0. After the call batch.data is null and
// batch.size is 0, which makes a repeated call harmless.
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
    if (batch.size > 0) {
        delete[] batch.data;
        batch.data = nullptr; // do not leave a dangling pointer behind
        batch.size = 0;
    }
}
// Releases the image array held by `batch` and marks the batch as empty.
//
// The array is only deleted when batch.size is non-zero, so batch.data is never
// touched for a batch whose size is 0. After the call batch.data is null and
// batch.size is 0, which makes a repeated call harmless.
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
    if (batch.size > 0) {
        delete[] batch.data;
        batch.data = nullptr; // do not leave a dangling pointer behind
        batch.size = 0;
    }
}
2023-10-12 17:23:18 +02:00
2023-11-06 22:36:23 +01:00
static void build_clip_img_from_data ( const stbi_uc * data , int nx , int ny , clip_image_u8 * img ) {
2023-10-12 17:23:18 +02:00
img - > nx = nx ;
img - > ny = ny ;
2023-12-30 22:24:42 +01:00
img - > buf . resize ( 3 * nx * ny ) ;
memcpy ( img - > buf . data ( ) , data , img - > buf . size ( ) ) ;
2023-11-06 22:36:23 +01:00
}
2023-10-12 17:23:18 +02:00
2023-11-06 22:36:23 +01:00
bool clip_image_load_from_file ( const char * fname , clip_image_u8 * img ) {
int nx , ny , nc ;
2023-12-30 22:24:42 +01:00
auto * data = stbi_load ( fname , & nx , & ny , & nc , 3 ) ;
2023-11-06 22:36:23 +01:00
if ( ! data ) {
fprintf ( stderr , " %s: failed to load image '%s' \n " , __func__ , fname ) ;
return false ;
}
build_clip_img_from_data ( data , nx , ny , img ) ;
2023-10-12 17:23:18 +02:00
stbi_image_free ( data ) ;
2023-11-06 22:36:23 +01:00
return true ;
}
2023-10-12 17:23:18 +02:00
2023-11-06 22:36:23 +01:00
bool clip_image_load_from_bytes ( const unsigned char * bytes , size_t bytes_length , struct clip_image_u8 * img ) {
int nx , ny , nc ;
2023-12-30 22:24:42 +01:00
auto * data = stbi_load_from_memory ( bytes , bytes_length , & nx , & ny , & nc , 3 ) ;
2023-11-06 22:36:23 +01:00
if ( ! data ) {
fprintf ( stderr , " %s: failed to decode image bytes \n " , __func__ ) ;
return false ;
}
build_clip_img_from_data ( data , nx , ny , img ) ;
stbi_image_free ( data ) ;
2023-10-12 17:23:18 +02:00
return true ;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// Linear blend between two values: yields s at t == 0 and e at t == 1.
// t is not clamped, so values outside [0, 1] extrapolate along the same line.
inline float lerp(float s, float e, float t) {
    const float span = e - s;
    return s + span * t;
}
// Bilinear resize of an interleaved 3-channel 8-bit image.
//
// dst is set to target_width x target_height and its buffer is resized to
// 3 * target_width * target_height bytes. Destination pixel (x, y) samples the
// source at (x * (src.nx - 1) / target_width, y * (src.ny - 1) / target_height)
// and blends the four surrounding source pixels with lerp().
//
// The "+1" neighbour coordinates are clamped to the last source column/row so
// that a source that is only one pixel wide or tall does not read from the
// next row or past the end of the buffer. If the source has no pixels
// (nx <= 0 or ny <= 0) nothing is sampled and dst keeps whatever contents the
// resize left in its buffer.
static void bilinear_resize(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
    dst.nx = target_width;
    dst.ny = target_height;
    dst.buf.resize(3 * target_width * target_height);

    const int src_w = src.nx;
    const int src_h = src.ny;
    if (src_w <= 0 || src_h <= 0) {
        return; // nothing to sample from
    }

    float x_ratio = static_cast<float>(src_w - 1) / target_width;
    float y_ratio = static_cast<float>(src_h - 1) / target_height;

    for (int y = 0; y < target_height; y++) {
        for (int x = 0; x < target_width; x++) {
            float px = x_ratio * x;
            float py = y_ratio * y;
            int x_floor = static_cast<int>(px);
            int y_floor = static_cast<int>(py);
            float x_lerp = px - x_floor;
            float y_lerp = py - y_floor;

            // right / bottom neighbours, clamped to stay inside the source image
            const int x_next = std::min(x_floor + 1, src_w - 1);
            const int y_next = std::min(y_floor + 1, src_h - 1);

            for (int c = 0; c < 3; c++) {
                float top = lerp(
                    static_cast<float>(src.buf[3 * (y_floor * src_w + x_floor) + c]),
                    static_cast<float>(src.buf[3 * (y_floor * src_w + x_next) + c]),
                    x_lerp
                );
                float bottom = lerp(
                    static_cast<float>(src.buf[3 * (y_next * src_w + x_floor) + c]),
                    static_cast<float>(src.buf[3 * (y_next * src_w + x_next) + c]),
                    x_lerp
                );
                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
            }
        }
    }
}
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
static void normalize_image_u8_to_f32 ( const clip_image_u8 * src , clip_image_f32 * dst , const float mean [ 3 ] , const float std [ 3 ] ) {
dst - > nx = src - > nx ;
dst - > ny = src - > ny ;
dst - > buf . resize ( src - > buf . size ( ) ) ;
for ( size_t i = 0 ; i < src - > buf . size ( ) ; + + i ) {
int c = i % 3 ; // rgb
dst - > buf [ i ] = ( static_cast < float > ( src - > buf [ i ] ) / 255.0f - mean [ c ] ) / std [ c ] ;
}
}
// Clamp x into the range [lower, upper].
// The upper bound is applied first, then the lower bound, so when
// lower > upper the result is lower.
inline float clip(float x, float lower, float upper) {
    const float capped = (upper < x) ? upper : x;
    return (lower < capped) ? capped : lower;
}
// Bicubic resize of an interleaved 3-channel 8-bit image.
//
// dst is set to target_width x target_height and its buffer is resized to
// 3 * target_width * target_height bytes. For every destination pixel the
// 4x4 source neighbourhood around (tx * j, ty * i) is interpolated: first each
// of the four rows horizontally, then the four row results vertically. Source
// coordinates are clamped to the image, so border pixels are replicated.
// The result is rounded and saturated to [0, 255].
//
// Returns true on success, false when the source image has no pixels
// (nx <= 0 or ny <= 0); in that case dst is sized but not written.
//
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
//    -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
//    -> https://en.wikipedia.org/wiki/Bicubic_interpolation
static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
    const int nx = img.nx;
    const int ny = img.ny;

    dst.nx = target_width;
    dst.ny = target_height;
    dst.buf.resize(3 * target_width * target_height);

    if (nx <= 0 || ny <= 0) {
        return false; // no source pixels to sample
    }

    const float tx = (float) nx / (float) target_width;
    const float ty = (float) ny / (float) target_height;

    // Clamp a coordinate to [0, hi] using integers only. Buffer offsets must not
    // be computed in float: a float represents integers exactly only up to 2^24,
    // so offsets into large images would be rounded to the wrong byte.
    const auto clamp_coord = [](int v, int hi) { return std::max(0, std::min(v, hi)); };

    // Cubic through the samples at -1, 0, +1, +2, evaluated at t (measured from sample 0).
    const auto cubic = [](float p_m1, float p_0, float p_p1, float p_p2, float t) {
        const float d0 = p_m1 - p_0;
        const float d2 = p_p1 - p_0;
        const float d3 = p_p2 - p_0;
        const float a0 = p_0;
        const float a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
        const float a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
        const float a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3;
        return a0 + a1 * t + a2 * t * t + a3 * t * t * t;
    };

    for (int i = 0; i < target_height; i++) {
        for (int j = 0; j < target_width; j++) {
            const int x = (int) (tx * j);
            const int y = (int) (ty * i);

            const float dx = tx * j - x;
            const float dy = ty * i - y;

            // the four source columns of the neighbourhood, clamped to the image
            const size_t col_m1 = clamp_coord(x - 1, nx - 1);
            const size_t col_0  = clamp_coord(x,     nx - 1);
            const size_t col_p1 = clamp_coord(x + 1, nx - 1);
            const size_t col_p2 = clamp_coord(x + 2, nx - 1);

            for (int k = 0; k < 3; k++) {
                float C[4];

                // horizontal pass: one interpolated value per neighbourhood row
                for (int jj = 0; jj <= 3; jj++) {
                    const size_t row_base = (size_t) clamp_coord(y - 1 + jj, ny - 1) * nx;

                    C[jj] = cubic(
                        img.buf[(row_base + col_m1) * 3 + k],
                        img.buf[(row_base + col_0)  * 3 + k],
                        img.buf[(row_base + col_p1) * 3 + k],
                        img.buf[(row_base + col_p2) * 3 + k],
                        dx);
                }

                // vertical pass: runs once, after all four rows are available
                const float Cc = cubic(C[0], C[1], C[2], C[3], dy);

                const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
                dst.buf[((size_t) i * target_width + j) * 3 + k] = Cc2;
            }
        }
    }

    return true;
}
// llava-1.6 type of resize_and_pad (black)
static void resize_and_pad_image ( const clip_image_u8 & image , clip_image_u8 & image_output , const std : : pair < int , int > & target_resolution ) {
int target_width = target_resolution . first ;
int target_height = target_resolution . second ;
float scale_w = static_cast < float > ( target_width ) / image . nx ;
float scale_h = static_cast < float > ( target_height ) / image . ny ;
int new_width , new_height ;
if ( scale_w < scale_h ) {
new_width = target_width ;
new_height = std : : min ( static_cast < int > ( std : : ceil ( image . ny * scale_w ) ) , target_height ) ;
} else {
new_height = target_height ;
new_width = std : : min ( static_cast < int > ( std : : ceil ( image . nx * scale_h ) ) , target_width ) ;
}
clip_image_u8 resized_image ;
// bilinear_resize(image, resized_image, new_width, new_height);
bicubic_resize ( image , resized_image , new_width , new_height ) ;
clip_image_u8 padded_image ;
padded_image . nx = target_width ;
padded_image . ny = target_height ;
padded_image . buf . resize ( 3 * target_width * target_height , 0 ) ; // Initialize with black
// Calculate padding offsets
int pad_x = ( target_width - new_width ) / 2 ;
int pad_y = ( target_height - new_height ) / 2 ;
// Copy the resized image into the center of the padded buffer
for ( int y = 0 ; y < new_height ; + + y ) {
for ( int x = 0 ; x < new_width ; + + x ) {
for ( int c = 0 ; c < 3 ; + + c ) {
padded_image . buf [ 3 * ( ( y + pad_y ) * target_width + ( x + pad_x ) ) + c ] = resized_image . buf [ 3 * ( y * new_width + x ) + c ] ;
}
}
}
image_output = std : : move ( padded_image ) ;
}
/**
 * Chooses, among candidate resolutions, the one that suits an image of the given size best.
 *
 * For each candidate the image is scaled (keeping its aspect ratio) to fit inside it. The
 * "effective" resolution is the pixel count of that scaled image, capped at the pixel count
 * of the original; the "wasted" resolution is the candidate's pixel count minus the effective
 * one. The candidate with the highest effective resolution wins; ties are broken by the
 * lowest wasted resolution, and among full ties the earliest candidate is kept.
 *
 * @param original_size        The original size of the image in the format (width, height).
 * @param possible_resolutions Candidate resolutions in the format [(width1, height1), (width2, height2), ...].
 * @return The best fit resolution in the format (width, height); (0, 0) if no candidate
 *         reaches a positive effective resolution (e.g. the list is empty).
 */
static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
    const int src_w    = original_size.first;
    const int src_h    = original_size.second;
    const int src_area = src_w * src_h;

    std::pair<int, int> winner;
    int best_effective = 0;
    int least_wasted   = std::numeric_limits<int>::max();

    for (size_t idx = 0; idx < possible_resolutions.size(); ++idx) {
        const std::pair<int, int> & candidate = possible_resolutions[idx];
        const int cand_w = candidate.first;
        const int cand_h = candidate.second;

        // largest uniform scale at which the image still fits into the candidate
        const float fit_w = static_cast<float>(cand_w) / src_w;
        const float fit_h = static_cast<float>(cand_h) / src_h;
        const float scale = std::min(fit_w, fit_h);

        const int scaled_w = static_cast<int>(src_w * scale);
        const int scaled_h = static_cast<int>(src_h * scale);

        const int effective = std::min(scaled_w * scaled_h, src_area);
        const int wasted    = cand_w * cand_h - effective;
        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", cand_w, cand_h, scale, scaled_w, scaled_h, effective, wasted);

        if (effective < best_effective) {
            continue; // strictly worse coverage
        }
        if (effective == best_effective && wasted >= least_wasted) {
            continue; // same coverage but no less waste
        }

        best_effective = effective;
        least_wasted   = wasted;
        winner         = candidate;
    }

    return winner;
}
static std : : vector < clip_image_u8 * > divide_to_patches_u8 ( const clip_image_u8 & image , int patch_size ) {
std : : vector < clip_image_u8 * > patches ;
int width = image . nx ;
int height = image . ny ;
for ( int i = 0 ; i < height ; i + = patch_size ) {
for ( int j = 0 ; j < width ; j + = patch_size ) {
clip_image_u8 * patch = clip_image_u8_init ( ) ;
patch - > nx = std : : min ( patch_size , width - j ) ;
patch - > ny = std : : min ( patch_size , height - i ) ;
patch - > buf . resize ( 3 * patch - > nx * patch - > ny ) ;
for ( int y = 0 ; y < patch - > ny ; + + y ) {
for ( int x = 0 ; x < patch - > nx ; + + x ) {
for ( int c = 0 ; c < 3 ; + + c ) {
patch - > buf [ 3 * ( y * patch - > nx + x ) + c ] = image . buf [ 3 * ( ( i + y ) * width + ( j + x ) ) + c ] ;
}
}
}
patches . push_back ( patch ) ;
}
}
return patches ;
}
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess ( struct clip_ctx * ctx , const clip_image_u8 * img , clip_image_f32_batch & res_imgs ) {
bool pad_to_square = true ;
2023-10-12 17:23:18 +02:00
if ( ! ctx - > has_vision_encoder ) {
printf ( " This gguf file seems to have no vision encoder \n " ) ;
return false ;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
auto & params = ctx - > vision_model . hparams ;
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
if ( strcmp ( params . mm_patch_merge_type , " spatial_unpad " ) = = 0 ) {
pad_to_square = false ;
}
// free the previous res_imgs if any set
2024-02-15 09:01:57 +01:00
if ( res_imgs . size > 0 ) {
clip_image_f32_batch_free ( res_imgs ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
}
res_imgs . data = nullptr ;
res_imgs . size = 0 ;
2023-10-12 17:23:18 +02:00
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
2023-12-30 22:24:42 +01:00
clip_image_u8 * temp = clip_image_u8_init ( ) ; // we will keep the input image data here temporarily
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( pad_to_square & & img - > nx ! = img - > ny ) {
2023-10-12 17:23:18 +02:00
int longer_side = std : : max ( img - > nx , img - > ny ) ;
2023-11-06 22:36:23 +01:00
temp - > nx = longer_side ;
temp - > ny = longer_side ;
2023-12-30 22:24:42 +01:00
temp - > buf . resize ( 3 * longer_side * longer_side ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
const uint8_t bc [ 3 ] = { 122 , 116 , 104 } ; // background color in RGB from LLaVA (this is the mean rgb color * 255)
2023-10-12 17:23:18 +02:00
// fill with background color
2023-12-30 22:24:42 +01:00
for ( size_t i = 0 ; i < temp - > buf . size ( ) ; i + + ) {
temp - > buf [ i ] = bc [ i % 3 ] ;
2023-10-12 17:23:18 +02:00
}
// copy from the input image
for ( int y = 0 ; y < img - > ny ; y + + ) {
for ( int x = 0 ; x < img - > nx ; x + + ) {
const int i = 3 * ( y * img - > nx + x ) ;
2023-11-06 22:36:23 +01:00
const int j = 3 * ( y * temp - > nx + x ) ;
2023-12-30 22:24:42 +01:00
temp - > buf [ j ] = img - > buf [ i ] ;
temp - > buf [ j + 1 ] = img - > buf [ i + 1 ] ;
temp - > buf [ j + 2 ] = img - > buf [ i + 2 ] ;
2023-10-12 17:23:18 +02:00
}
}
} else {
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( params . image_grid_pinpoints [ 0 ] ! = 0 ) {
// "spatial_unpad" with "anyres" processing for llava-1.6
std : : vector < std : : pair < int , int > > possible_resolutions ;
for ( int i = 0 ; i < 32 & & params . image_grid_pinpoints [ i ] ! = 0 ; i + = 2 ) {
possible_resolutions . push_back ( { params . image_grid_pinpoints [ i ] , params . image_grid_pinpoints [ i + 1 ] } ) ;
}
std : : pair < int , int > best_resolution = select_best_resolution ( { img - > nx , img - > ny } , possible_resolutions ) ;
// clip_image_save_to_bmp(*img, "input.bmp");
resize_and_pad_image ( * img , * temp , best_resolution ) ; // we do not pad with mean-bg color anymore in llava-1.6
// clip_image_save_to_bmp(*temp, "resized.bmp");
// visually verify normalized image:
// normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
// {
// clip_image_u8 * temp2 = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*res, *temp2);
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
// clip_image_u8_free(temp2);
// }
std : : vector < clip_image_u8 * > patches = divide_to_patches_u8 ( * temp , params . image_size ) ; // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
clip_image_u8 * image_original_resize = clip_image_u8_init ( ) ;
// bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
bicubic_resize ( * img , * image_original_resize , params . image_size , params . image_size ) ; // in python this is "shortest_edge", but all CLIP are square
patches . insert ( patches . begin ( ) , image_original_resize ) ;
// clip_image_f32_batch_init(patches.size());
res_imgs . size = patches . size ( ) ;
res_imgs . data = new clip_image_f32 [ res_imgs . size ] ;
int num = 0 ;
for ( auto & patch : patches ) {
normalize_image_u8_to_f32 ( patch , & res_imgs . data [ num ] , ctx - > image_mean , ctx - > image_std ) ;
num + + ;
}
for ( size_t i = 0 ; i < patches . size ( ) ; i + + ) {
// printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
clip_image_u8_free ( patches [ i ] ) ;
}
clip_image_u8_free ( temp ) ;
return true ;
} else {
temp - > nx = img - > nx ;
temp - > ny = img - > ny ;
temp - > buf . resize ( img - > buf . size ( ) ) ;
memcpy ( temp - > buf . data ( ) , img - > buf . data ( ) , temp - > buf . size ( ) ) ;
}
2023-10-12 17:23:18 +02:00
}
2023-11-06 22:36:23 +01:00
const int nx = temp - > nx ;
const int ny = temp - > ny ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
2023-10-12 17:23:18 +02:00
const int nx2 = ctx - > vision_model . hparams . image_size ;
const int ny2 = ctx - > vision_model . hparams . image_size ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
clip_image_f32 * res = clip_image_f32_init ( ) ;
2023-10-12 17:23:18 +02:00
res - > nx = nx2 ;
res - > ny = ny2 ;
2023-12-30 22:24:42 +01:00
res - > buf . resize ( 3 * nx2 * ny2 ) ;
2023-10-12 17:23:18 +02:00
const float scale = std : : max ( nx , ny ) / ( float ) ctx - > vision_model . hparams . image_size ;
const int nx3 = int ( nx / scale + 0.5f ) ;
const int ny3 = int ( ny / scale + 0.5f ) ;
const auto & m3 = ctx - > image_mean ; // {0.48145466f, 0.4578275f, 0.40821073f};
const auto & s3 = ctx - > image_std ; // {0.26862954f, 0.26130258f, 0.27577711f};
for ( int y = 0 ; y < ny3 ; y + + ) {
for ( int x = 0 ; x < nx3 ; x + + ) {
for ( int c = 0 ; c < 3 ; c + + ) {
// linear interpolation
const float sx = ( x + 0.5f ) * scale - 0.5f ;
const float sy = ( y + 0.5f ) * scale - 0.5f ;
const int x0 = std : : max ( 0 , ( int ) std : : floor ( sx ) ) ;
const int y0 = std : : max ( 0 , ( int ) std : : floor ( sy ) ) ;
const int x1 = std : : min ( x0 + 1 , nx - 1 ) ;
const int y1 = std : : min ( y0 + 1 , ny - 1 ) ;
const float dx = sx - x0 ;
const float dy = sy - y0 ;
const int j00 = 3 * ( y0 * nx + x0 ) + c ;
const int j01 = 3 * ( y0 * nx + x1 ) + c ;
const int j10 = 3 * ( y1 * nx + x0 ) + c ;
const int j11 = 3 * ( y1 * nx + x1 ) + c ;
2023-12-30 22:24:42 +01:00
const float v00 = temp - > buf [ j00 ] ;
const float v01 = temp - > buf [ j01 ] ;
const float v10 = temp - > buf [ j10 ] ;
const float v11 = temp - > buf [ j11 ] ;
2023-10-12 17:23:18 +02:00
const float v0 = v00 * ( 1.0f - dx ) + v01 * dx ;
const float v1 = v10 * ( 1.0f - dx ) + v11 * dx ;
const float v = v0 * ( 1.0f - dy ) + v1 * dy ;
const uint8_t v2 = std : : min ( std : : max ( std : : round ( v ) , 0.0f ) , 255.0f ) ;
const int i = 3 * ( y * nx3 + x ) + c ;
2023-12-30 22:24:42 +01:00
res - > buf [ i ] = ( ( float ( v2 ) / 255.0f ) - m3 [ c ] ) / s3 [ c ] ;
2023-10-12 17:23:18 +02:00
}
}
}
2023-11-06 22:36:23 +01:00
clip_image_u8_free ( temp ) ;
2023-10-12 17:23:18 +02:00
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// {
// clip_image_u8 * temp2 = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*res, *temp2);
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
// clip_image_u8_free(temp2);
// }
// res_imgs.push_back(res);
res_imgs . size = 1 ;
res_imgs . data = new clip_image_f32 [ res_imgs . size ] ;
2024-02-15 09:01:57 +01:00
res_imgs . data [ 0 ] = * res ;
clip_image_f32_free ( res ) ;
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
2023-10-12 17:23:18 +02:00
return true ;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
// Returns the image_newline tensor stored in the context's vision model.
// The pointer is handed back as-is; no copy is made.
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
    const auto & vision = ctx->vision_model;
    return vision.image_newline;
}
2023-10-12 17:23:18 +02:00
// Releases a clip context (e.g. one obtained from clip_model_load).
//
// Frees the ggml data context and the gguf context owned by `ctx`, then
// deletes `ctx` itself. Passing a null pointer is a no-op, so callers can
// release unconditionally on their cleanup paths.
void clip_free(clip_ctx * ctx) {
    if (ctx == nullptr) {
        return;
    }

    ggml_free(ctx->ctx_data);
    gguf_free(ctx->ctx_gguf);

    // NOTE(review): ctx->backend and ctx->compute_alloc (both used by
    // clip_image_batch_encode) are not released here - confirm whether they
    // are freed elsewhere or whether this leaks them.
    delete ctx;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not load the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
size_t clip_embd_nbytes ( const struct clip_ctx * ctx ) {
return clip_n_patches ( ctx ) * clip_n_mmproj_embd ( ctx ) * sizeof ( float ) ;
}
int32_t clip_image_size ( const struct clip_ctx * ctx ) {
return ctx - > vision_model . hparams . image_size ;
}
int32_t clip_patch_size ( const struct clip_ctx * ctx ) {
return ctx - > vision_model . hparams . patch_size ;
}
int32_t clip_hidden_size ( const struct clip_ctx * ctx ) {
return ctx - > vision_model . hparams . hidden_size ;
}
const char * clip_patch_merge_type ( const struct clip_ctx * ctx ) {
return ctx - > vision_model . hparams . mm_patch_merge_type ;
}
const int32_t * clip_image_grid ( const struct clip_ctx * ctx ) {
return ctx - > vision_model . hparams . image_grid_pinpoints ;
}
int clip_n_patches ( const struct clip_ctx * ctx ) {
const auto & params = ctx - > vision_model . hparams ;
int n_patches = ( params . image_size / params . patch_size ) * ( params . image_size / params . patch_size ) ;
if ( ctx - > proj_type = = PROJECTOR_TYPE_LDP ) {
n_patches / = 4 ;
}
return n_patches ;
}
2023-12-30 22:24:42 +01:00
bool clip_image_encode ( struct clip_ctx * ctx , const int n_threads , clip_image_f32 * img , float * vec ) {
2023-10-12 17:23:18 +02:00
if ( ! ctx - > has_vision_encoder ) {
printf ( " This gguf file seems to have no vision encoder \n " ) ;
return false ;
}
clip_image_f32_batch imgs { } ;
imgs . size = 1 ;
imgs . data = img ;
return clip_image_batch_encode ( ctx , n_threads , & imgs , vec ) ;
}
2023-12-30 22:24:42 +01:00
// Encodes a batch of preprocessed images and copies the result into `vec`.
//
// Steps:
//   1. build the inference graph for `imgs` and allocate it on ctx->compute_alloc
//   2. fill the graph inputs (raw pixels, zeroed embeddings, position and patch indices)
//   3. run the graph on ctx->backend
//   4. copy the last graph node (the embedding tensor) into `vec`
//
// ctx       - clip context; must have a vision encoder
// n_threads - thread count for the CPU backend (also used as n_cb for Metal)
// imgs      - batch of images; every image must be image_size x image_size
// vec       - destination; must have room for ggml_nbytes() of the output tensor
//
// Returns false when the context has no vision encoder or when a staging
// buffer cannot be allocated, true otherwise. With a llava projector only a
// batch size of 1 is accepted (asserted).
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
        printf(" This gguf file seems to have no vision encoder \n ");
        return false;
    }

    int batch_size = imgs->size;
    if (ctx->has_llava_projector) {
        GGML_ASSERT(batch_size == 1); // TODO: support multiple images
    }

    // build the inference graph
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);

    // set inputs
    const auto & model   = ctx->vision_model;
    const auto & hparams = model.hparams;

    const int image_size    = hparams.image_size;
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size / patch_size) * (image_size / patch_size));
    // NOTE(review): presumably the extra position is for a class token - confirm
    const int num_positions = num_patches + 1;

    {
        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, " inp_raw ");
        const size_t nbytes = ggml_nbytes(inp_raw);
        float * data = (float *) malloc(nbytes);
        if (data == nullptr && nbytes > 0) {
            printf("%s: failed to allocate %zu bytes for the input image tensor\n", __func__, nbytes);
            return false;
        }

        // Each image is visited exactly once: validate its dimensions, then
        // convert it from interleaved per-pixel channels (buf[3*pixel + k])
        // to planar per-channel layout (one nx*ny plane per channel).
        for (int b = 0; b < batch_size; b++) {
            const auto & img = imgs->data[b];

            const int nx = img.nx;
            const int ny = img.ny;
            GGML_ASSERT(nx == image_size && ny == image_size);

            const int n = nx * ny;

            for (int k = 0; k < 3; k++) {
                for (int y = 0; y < ny; y++) {
                    for (int x = 0; x < nx; x++) {
                        data[(b * 3 * n) + k * n + y * nx + x] = img.buf[3 * (y * nx + x) + k];
                    }
                }
            }
        }

        ggml_backend_tensor_set(inp_raw, data, 0, nbytes);
        free(data);
    }

    {
        // the "embeddings" input starts out as all zero bytes
        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, " embeddings ");
        const size_t nbytes = ggml_nbytes(embeddings);
        void * zero_mem = malloc(nbytes);
        if (zero_mem == nullptr && nbytes > 0) {
            printf("%s: failed to allocate %zu bytes for the embeddings tensor\n", __func__, nbytes);
            return false;
        }
        if (zero_mem != nullptr) {
            memset(zero_mem, 0, nbytes);
        }
        ggml_backend_tensor_set(embeddings, zero_mem, 0, nbytes);
        free(zero_mem);
    }

    {
        // positions = 0, 1, ..., num_positions - 1
        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, " positions ");
        const size_t nbytes = ggml_nbytes(positions);
        int * positions_data = (int *) malloc(nbytes);
        if (positions_data == nullptr && nbytes > 0) {
            printf("%s: failed to allocate %zu bytes for the positions tensor\n", __func__, nbytes);
            return false;
        }
        for (int i = 0; i < num_positions; i++) {
            positions_data[i] = i;
        }
        ggml_backend_tensor_set(positions, positions_data, 0, nbytes);
        free(positions_data);
    }

    {
        // patches = 1, 2, ..., num_patches (index 0 is skipped)
        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, " patches ");
        const size_t nbytes = ggml_nbytes(patches);
        int * patches_data = (int *) malloc(nbytes);
        if (patches_data == nullptr && nbytes > 0) {
            printf("%s: failed to allocate %zu bytes for the patches tensor\n", __func__, nbytes);
            return false;
        }
        for (int i = 0; i < num_patches; i++) {
            patches_data[i] = i + 1;
        }
        ggml_backend_tensor_set(patches, patches_data, 0, nbytes);
        free(patches_data);
    }

    if (ggml_backend_is_cpu(ctx->backend)) {
        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
    }

#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(ctx->backend)) {
        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
    }
#endif

    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

    return true;
}
bool clip_model_quantize ( const char * fname_inp , const char * fname_out , const int itype ) {
ggml_type type = GGML_TYPE_Q4_1 ;
2024-01-10 14:37:09 +01:00
assert ( itype < GGML_TYPE_COUNT ) ;
type = static_cast < ggml_type > ( itype ) ;
2023-10-12 17:23:18 +02:00
2023-12-30 22:24:42 +01:00
auto * ctx_clip = clip_model_load ( fname_inp , 2 ) ;
2023-10-12 17:23:18 +02:00
const auto & ctx_src = ctx_clip - > ctx_gguf ;
2023-12-30 22:24:42 +01:00
const auto & ctx_data = ctx_clip - > ctx_data ;
2023-10-12 17:23:18 +02:00
2023-12-30 22:24:42 +01:00
auto * ctx_out = gguf_init_empty ( ) ;
2023-10-12 17:23:18 +02:00
gguf_set_kv ( ctx_out , ctx_src ) ;
gguf_set_val_u32 ( ctx_out , " general.quantization_version " , GGML_QNT_VERSION ) ;
gguf_set_val_u32 ( ctx_out , " general.file_type " , itype ) ;
auto fout = std : : ofstream ( fname_out , std : : ios : : binary ) ;
const int n_tensors = gguf_get_n_tensors ( ctx_src ) ;
for ( int i = 0 ; i < n_tensors ; + + i ) {
const char * name = gguf_get_tensor_name ( ctx_src , i ) ;
struct ggml_tensor * cur = ggml_get_tensor ( ctx_data , name ) ;
gguf_add_tensor ( ctx_out , cur ) ;
}
const size_t meta_size = gguf_get_meta_size ( ctx_out ) ;
for ( size_t i = 0 ; i < meta_size ; + + i ) {
fout . put ( 0 ) ;
}
// regexes of tensor names to be quantized
const std : : vector < std : : string > k_names = {
" .*weight " ,
} ;
std : : vector < uint8_t > work ( 512 ) ;
std : : vector < float > conv_buf ( 512 ) ;
size_t total_size_org = 0 ;
size_t total_size_new = 0 ;
for ( int i = 0 ; i < n_tensors ; + + i ) {
const std : : string name = gguf_get_tensor_name ( ctx_src , i ) ;
struct ggml_tensor * cur = ggml_get_tensor ( ctx_data , name . c_str ( ) ) ;
enum ggml_type new_type ;
void * new_data ;
size_t new_size ;
bool quantize = false ;
for ( const auto & s : k_names ) {
if ( std : : regex_match ( name , std : : regex ( s ) ) ) {
quantize = true ;
break ;
}
}
// quantize only 2D tensors
2023-12-14 16:52:08 +01:00
quantize & = ( ggml_n_dims ( cur ) = = 2 ) ;
2023-10-12 17:23:18 +02:00
if ( quantize ) {
new_type = type ;
2024-01-10 14:37:09 +01:00
if ( new_type > = GGML_TYPE_Q2_K & & name . find ( " embd " ) ! = std : : string : : npos ) {
new_type = GGML_TYPE_Q8_0 ; // ggml_get_rows needs non K type
// fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
}
2023-10-12 17:23:18 +02:00
const size_t n_elms = ggml_nelements ( cur ) ;
float * f32_data ;
switch ( cur - > type ) {
case GGML_TYPE_F32 :
f32_data = ( float * ) cur - > data ;
break ;
case GGML_TYPE_F16 :
if ( conv_buf . size ( ) < n_elms ) {
conv_buf . resize ( n_elms ) ;
}
for ( size_t j = 0 ; j < n_elms ; + + j ) {
conv_buf [ j ] = ggml_fp16_to_fp32 ( ( ( ggml_fp16_t * ) cur - > data ) [ j ] ) ;
}
f32_data = ( float * ) conv_buf . data ( ) ;
break ;
default :
printf ( " Please use an input file in f32 or f16 \n " ) ;
2024-03-14 19:29:32 +01:00
gguf_free ( ctx_out ) ;
2023-10-12 17:23:18 +02:00
return false ;
}
if ( work . size ( ) < n_elms * 4 ) {
work . resize ( n_elms * 4 ) ;
}
new_data = work . data ( ) ;
2024-03-09 14:53:59 +01:00
new_size = ggml_quantize_chunk ( new_type , f32_data , new_data , 0 , n_elms / cur - > ne [ 0 ] , cur - > ne [ 0 ] , nullptr ) ;
2023-10-12 17:23:18 +02:00
} else {
new_type = cur - > type ;
new_data = cur - > data ;
new_size = ggml_nbytes ( cur ) ;
}
const size_t orig_size = ggml_nbytes ( cur ) ;
total_size_org + = orig_size ;
total_size_new + = new_size ;
gguf_set_tensor_type ( ctx_out , name . c_str ( ) , new_type ) ;
gguf_set_tensor_data ( ctx_out , name . c_str ( ) , new_data , new_size ) ;
fout . write ( ( const char * ) new_data , new_size ) ;
size_t pad = GGML_PAD ( new_size , gguf_get_alignment ( ctx_out ) ) - new_size ;
for ( size_t j = 0 ; j < pad ; + + j ) {
fout . put ( 0 ) ;
}
2023-12-14 16:52:08 +01:00
printf ( " %s: n_dims = %d | quantize=%d | size = %f MB -> %f MB \n " , name . c_str ( ) , ggml_n_dims ( cur ) , quantize ,
2023-10-12 17:23:18 +02:00
orig_size / 1024.0 / 1024.0 , new_size / 1024.0 / 1024.0 ) ;
}
// go back to beginning of file and write the updated metadata
fout . seekp ( 0 , std : : ios : : beg ) ;
std : : vector < uint8_t > meta ( meta_size ) ;
gguf_get_meta_data ( ctx_out , meta . data ( ) ) ;
fout . write ( ( const char * ) meta . data ( ) , meta_size ) ;
fout . close ( ) ;
clip_free ( ctx_clip ) ;
gguf_free ( ctx_out ) ;
{
2023-12-29 17:52:15 +01:00
printf ( " %s: original size = %8.2f MB \n " , __func__ , total_size_org / 1024.0 / 1024.0 ) ;
printf ( " %s: quantized size = %8.2f MB \n " , __func__ , total_size_new / 1024.0 / 1024.0 ) ;
2023-10-12 17:23:18 +02:00
}
return true ;
}
2023-11-06 22:36:23 +01:00
int clip_n_mmproj_embd ( const struct clip_ctx * ctx ) {
2024-01-22 14:09:35 +01:00
if ( ctx - > proj_type = = PROJECTOR_TYPE_LDP ) {
return ctx - > vision_model . mm_model_block_1_block_2_1_b - > ne [ 0 ] ;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( ctx - > proj_type = = PROJECTOR_TYPE_MLP ) {
2024-01-22 14:09:35 +01:00
return ctx - > vision_model . mm_2_b - > ne [ 0 ] ;
}
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
if ( ctx - > proj_type = = PROJECTOR_TYPE_MLP_NORM ) {
return ctx - > vision_model . mm_3_b - > ne [ 0 ] ;
2024-01-22 14:09:35 +01:00
}
2023-10-12 17:23:18 +02:00
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
std : : string proj_type = PROJECTOR_TYPE_NAMES [ ctx - > proj_type ] ;
throw std : : runtime_error ( format ( " %s: don't support projector with: %s currently \n " , __func__ , proj_type . c_str ( ) ) ) ;
2023-10-12 17:23:18 +02:00
}