2024-06-15 18:53:40 +02:00
# include "common.h"
# include "llama.h"
# include "ggml.h"
# include "pca.hpp"
# ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
# endif
# ifdef GGML_USE_METAL
# include "ggml-metal.h"
# endif
# include <cstdio>
# include <string>
# include <tuple>
# include <vector>
# include <algorithm>
# include <iostream>
# include <fstream>
# include <climits>
//////////////////////////////////////////////////
// utils
template < class Iter >
static std : : string tokens_to_str ( llama_context * ctx , Iter begin , Iter end ) {
std : : string ret ;
for ( ; begin ! = end ; + + begin ) {
ret + = llama_token_to_piece ( ctx , * begin ) ;
}
return ret ;
}
static void print_usage ( int argc , char * * argv , const gpt_params & params ) {
gpt_params_print_usage ( argc , argv , params ) ;
printf ( " \n example usage: \n " ) ;
printf ( " \n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf \n " , argv [ 0 ] ) ;
printf ( " \n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 \n " , argv [ 0 ] ) ;
2024-06-22 18:11:30 +02:00
printf ( " \n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100 \n " , argv [ 0 ] ) ;
2024-06-15 18:53:40 +02:00
printf ( " \n " ) ;
}
//////////////////////////////////////////////////
// cb_eval is reused for each pair of positive - negative prompt
struct callback_data {
ggml_context * ctx_ggml = nullptr ; // holds v_pos, v_neg, v_diff_filtered
int n_layers = 0 ;
int n_tokens = 0 ;
bool is_eval_pos = true ;
// each element of the vector correspond to one layer
std : : vector < struct ggml_tensor * > v_pos ; // vector of matrices of size [n_embd, n_tokens]
std : : vector < struct ggml_tensor * > v_neg ; // vector of matrices of size [n_embd, n_tokens]
std : : vector < struct ggml_tensor * > v_diff_filtered ; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
void save_tensor_for_layer ( struct ggml_tensor * t ) {
GGML_ASSERT ( t - > type = = GGML_TYPE_F32 ) ;
if ( ctx_ggml = = nullptr ) {
// alloc a new ctx_ggml if needed
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead ( ) * n_layers * 3u ,
/*.mem_buffer =*/ NULL ,
/*.no_alloc =*/ true ,
} ;
ctx_ggml = ggml_init ( params_ggml ) ;
}
// copy tensor data
auto n_bytes = ggml_nbytes ( t ) ;
struct ggml_tensor * t_layer = ggml_new_tensor_2d ( ctx_ggml , t - > type , t - > ne [ 0 ] , t - > ne [ 1 ] ) ;
t_layer - > data = malloc ( n_bytes ) ; // TODO @ngxson : get rid of this malloc somehow
ggml_backend_tensor_get ( t , t_layer - > data , 0 , n_bytes ) ;
ggml_set_name ( t_layer , ggml_get_name ( t ) ) ;
//print_debug_tensor(t_layer);
if ( is_eval_pos ) {
v_pos . push_back ( t_layer ) ;
} else {
v_neg . push_back ( t_layer ) ;
}
}
// calculate diff (v_pos - v_neg) and place the result back to v_pos
// all zero rows in the diff tensor will also be removed
// NOTE: final layer is ignored. we only have (n_layers - 1) to process
std : : vector < struct ggml_tensor * > calc_diff ( ) {
for ( float il = 0 ; il < v_pos . size ( ) ; il + + ) {
float * a = ( float * ) v_pos [ il ] - > data ;
float * b = ( float * ) v_neg [ il ] - > data ;
size_t n_elem = ggml_nelements ( v_pos [ il ] ) ;
for ( size_t j = 0 ; j < n_elem ; j + + ) {
a [ j ] - = b [ j ] ;
}
//print_debug_tensor(v_pos[i]);
auto diff_filtered = filter_nonzero_rows ( v_pos [ il ] ) ;
v_diff_filtered . push_back ( diff_filtered ) ;
}
return v_diff_filtered ; // for convinient, we return the result std::vector
}
// delete zero rows from a given 2D tensor
struct ggml_tensor * filter_nonzero_rows ( struct ggml_tensor * a ) {
//printf("filter_nonzero_rows\n");
auto is_row_all_zeros = [ ] ( struct ggml_tensor * t , int row , float eps ) - > bool {
// check if given row containing all zero elements
int n_cols = t - > ne [ 0 ] ; // hint: should be equal to n_embd
for ( int col = 0 ; col < n_cols ; + + col ) {
if ( ggml_get_f32_nd ( t , col , row , 0 , 0 ) > eps ) {
return false ;
}
}
return true ;
} ;
std : : vector < int > rows_to_copy ; // the idx of non-zero cols (to be copied to row of diff_filtered)
for ( int i_row = 0 ; i_row < a - > ne [ 1 ] ; i_row + + ) {
if ( ! is_row_all_zeros ( a , i_row , 1e-6 ) ) {
rows_to_copy . push_back ( i_row ) ;
}
}
// get "n_nonzero_rows" for the output "diff_filtered"
int n_nonzero_rows = rows_to_copy . size ( ) ;
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
int n_embd = a - > ne [ 0 ] ;
GGML_ASSERT ( n_nonzero_rows > 0 ) ;
// diff_filtered: [n_embd, n_nonzero_rows]
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d (
ctx_ggml , GGML_TYPE_F32 , n_embd , n_nonzero_rows ) ;
ggml_format_name ( diff_filtered , " diff_filtered_%s " , a - > name ) ;
diff_filtered - > data = malloc ( ggml_nbytes ( diff_filtered ) ) ;
// copy non-zero rows
for ( int dest_row = 0 ; dest_row < n_nonzero_rows ; dest_row + + ) {
int src_row = rows_to_copy [ dest_row ] ;
for ( int i = 0 ; i < n_embd ; i + + ) {
float src_elem = ggml_get_f32_nd ( a , i , src_row , 0 , 0 ) ;
ggml_set_f32_nd ( diff_filtered , i , dest_row , 0 , 0 , src_elem ) ;
}
}
//print_debug_tensor(diff_filtered);
return diff_filtered ;
}
// we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
void reset ( ) {
for ( auto ptr : v_pos ) free ( ptr - > data ) ;
for ( auto ptr : v_neg ) free ( ptr - > data ) ;
for ( auto ptr : v_diff_filtered ) free ( ptr - > data ) ;
v_pos . clear ( ) ;
v_neg . clear ( ) ;
v_diff_filtered . clear ( ) ;
if ( ctx_ggml ) {
ggml_free ( ctx_ggml ) ;
}
ctx_ggml = nullptr ;
}
} ;
/**
* process_ctx is used to store the ggml context for pre - post processing the diff vectors
* in short , input = > v_diff and output = > v_final
*/
struct train_context {
ggml_context * ctx_ggml ;
int n_embd ;
int n_layers ;
/* pair of prompts to be used for generating final vector */
std : : vector < std : : string > positive_entries ;
std : : vector < std : : string > negative_entries ;
// each element of the vector correspond to one layer
// NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
// NOTE (2): v_diff is transposed from v_diff_tmp
std : : vector < struct ggml_tensor * > v_diff ; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
std : : vector < struct ggml_tensor * > v_final ; // vector of vectors of size [n_embd] to be written to file
// to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
// v_diff_tmp will get converted unto v_diff later on
std : : vector < std : : vector < uint8_t > > v_diff_tmp ;
train_context ( int n_embd_ , int n_layers_ ) {
n_embd = n_embd_ ;
n_layers = n_layers_ ;
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead ( ) * ( n_layers - 1 ) * 2u ,
/*.mem_buffer =*/ NULL ,
/*.no_alloc =*/ true ,
} ;
ctx_ggml = ggml_init ( params_ggml ) ;
for ( int il = 0 ; il < n_layers - 1 ; il + + ) {
std : : vector < uint8_t > empty ;
v_diff_tmp . push_back ( empty ) ;
auto t = ggml_new_tensor_1d ( ctx_ggml , GGML_TYPE_F32 , n_embd ) ;
t - > data = malloc ( ggml_nbytes ( t ) ) ; // TODO: get rid of malloc if possible
v_final . push_back ( t ) ;
}
}
// add new rows into existing tensor in v_diff_tmp
void concat_diff_tmp ( const std : : vector < struct ggml_tensor * > & diff_filtered ) {
GGML_ASSERT ( ( int ) diff_filtered . size ( ) = = n_layers - 1 ) ;
for ( int il = 0 ; il < n_layers - 1 ; il + + ) {
auto t = diff_filtered [ il ] ;
auto & diff_tmp = v_diff_tmp [ il ] ;
size_t curr_size = diff_tmp . size ( ) ;
diff_tmp . resize ( curr_size + ggml_nbytes ( t ) ) ;
memcpy ( diff_tmp . data ( ) + curr_size , t - > data , ggml_nbytes ( t ) ) ;
}
}
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
void build_v_diff ( ) {
printf ( " build_v_diff \n " ) ;
for ( int il = 0 ; il < n_layers - 1 ; il + + ) {
auto & diff_tmp = v_diff_tmp [ il ] ;
int n_elem = diff_tmp . size ( ) / sizeof ( float ) ;
GGML_ASSERT ( n_elem % n_embd = = 0 ) ;
int n_rows = n_elem / n_embd ;
struct ggml_tensor * diff = ggml_new_tensor_2d ( ctx_ggml , GGML_TYPE_F32 , n_rows , n_embd ) ;
ggml_set_name ( diff , ( std : : string ( " diff_ " ) + std : : to_string ( il ) ) . c_str ( ) ) ;
// copy data & transpose
diff - > data = malloc ( ggml_nbytes ( diff ) ) ; // TODO: get rid of this malloc if possible
float * arr = ( float * ) diff_tmp . data ( ) ;
for ( int ir = 0 ; ir < n_rows ; + + ir ) {
for ( int ic = 0 ; ic < n_embd ; + + ic ) {
float f = arr [ ir * n_embd + ic ] ;
ggml_set_f32_nd ( diff , ir , ic , 0 , 0 , f ) ;
}
}
v_diff . push_back ( diff ) ;
print_debug_tensor ( diff ) ;
// free memory of diff_tmp
diff_tmp . resize ( 0 ) ;
}
}
~ train_context ( ) {
for ( auto ptr : v_final ) free ( ptr - > data ) ;
for ( auto ptr : v_diff ) free ( ptr - > data ) ;
// no need to free v_diff_tmp, since we didn't use malloc
ggml_free ( ctx_ggml ) ;
}
} ;
struct tokenized_prompt {
std : : vector < llama_token > tokens_pos ;
std : : vector < llama_token > tokens_neg ;
size_t max_seq_len ;
tokenized_prompt ( llama_context * ctx , std : : string pos , std : : string neg ) {
const bool add_bos = llama_should_add_bos_token ( llama_get_model ( ctx ) ) ;
tokens_pos = : : llama_tokenize ( ctx , pos , add_bos ) ;
tokens_neg = : : llama_tokenize ( ctx , neg , add_bos ) ;
max_seq_len = std : : max ( tokens_pos . size ( ) , tokens_neg . size ( ) ) ;
padding_seq ( ctx , tokens_pos , max_seq_len ) ;
padding_seq ( ctx , tokens_neg , max_seq_len ) ;
}
void padding_seq ( llama_context * ctx , std : : vector < llama_token > & tokens , size_t len ) {
// TODO: customize padding token
std : : vector < llama_token > pad_tokens = : : llama_tokenize ( ctx , " " , false ) ;
llama_token pad_tok = pad_tokens . back ( ) ;
while ( tokens . size ( ) < len ) {
tokens . push_back ( pad_tok ) ;
}
}
} ;
//////////////////////////////////////////////////
template < typename T >
static std : : string to_string ( const T & val ) {
std : : stringstream ss ;
ss < < val ;
return ss . str ( ) ;
}
static std : : vector < std : : string > ctrlvec_load_prompt_file ( std : : string path , bool skip_empty_lines ) {
std : : vector < std : : string > output ;
std : : ifstream file ( path ) ;
if ( ! file . is_open ( ) ) {
fprintf ( stderr , " error: unable to open file: %s \n " , path . c_str ( ) ) ;
exit ( 1 ) ;
}
std : : string line ;
while ( std : : getline ( file , line ) ) {
bool is_skip = skip_empty_lines & & line . empty ( ) ;
if ( ! is_skip ) {
string_process_escapes ( line ) ;
output . push_back ( line ) ;
}
}
file . close ( ) ;
return output ;
}
//////////////////////////////////////////////////
static bool cb_eval ( struct ggml_tensor * t , bool ask , void * user_data ) {
auto * cb_data = ( callback_data * ) user_data ;
static const char * l_out_name = " l_out " ;
const bool is_l_out = strncmp ( t - > name , l_out_name , strlen ( l_out_name ) ) = = 0 ;
if ( ask ) {
return is_l_out ;
}
if ( ! is_l_out | | t - > ne [ 1 ] ! = cb_data - > n_tokens ) {
return true ;
}
// save the tensor to current context
cb_data - > save_tensor_for_layer ( t ) ;
return true ;
}
static bool get_hidden_layers ( llama_context * ctx , std : : vector < llama_token > & tokens ) {
llama_kv_cache_clear ( ctx ) ;
if ( llama_decode ( ctx , llama_batch_get_one ( tokens . data ( ) , tokens . size ( ) , 0 , 0 ) ) ) {
fprintf ( stderr , " %s : failed to eval \n " , __func__ ) ;
return false ;
}
return true ;
}
static void export_gguf ( const std : : vector < struct ggml_tensor * > & v_ctrl , const std : : string fname , const std : : string model_hint ) {
struct gguf_context * ctx = gguf_init_empty ( ) ;
const std : : string arch = " controlvector " ;
gguf_set_val_str ( ctx , " general.architecture " , arch . c_str ( ) ) ;
gguf_set_val_str ( ctx , ( arch + " .model_hint " ) . c_str ( ) , model_hint . c_str ( ) ) ;
gguf_set_val_i32 ( ctx , ( arch + " .layer_count " ) . c_str ( ) , v_ctrl . size ( ) ) ;
for ( size_t i = 0 ; i < v_ctrl . size ( ) ; + + i ) {
gguf_add_tensor ( ctx , v_ctrl [ i ] ) ;
print_debug_tensor ( v_ctrl [ i ] ) ;
printf ( " Added tensor: %s \n " , v_ctrl [ i ] - > name ) ;
}
printf ( " %s: writing file... \n " , __func__ ) ;
gguf_write_to_file ( ctx , fname . c_str ( ) , false ) ;
printf ( " %s: wrote file '%s' \n " , __func__ , fname . c_str ( ) ) ;
gguf_free ( ctx ) ;
}
/**
* Load prompt files and completion file .
* Then format each pair of prompt + completion to make an entry .
*/
static int prepare_entries ( gpt_params & params , train_context & ctx_train ) {
// load prompts
std : : vector < std : : string > positive_prompts = ctrlvec_load_prompt_file ( params . cvector_positive_file , true ) ;
std : : vector < std : : string > negative_prompts = ctrlvec_load_prompt_file ( params . cvector_negative_file , true ) ;
if ( positive_prompts . size ( ) ! = negative_prompts . size ( ) ) {
fprintf ( stderr , " number of positive and negative prompts must be equal \n " ) ;
return 1 ;
}
if ( positive_prompts . empty ( ) ) {
fprintf ( stderr , " must provide at least one prompt pair \n " ) ;
return 1 ;
}
// create templated prompts
std : : vector < std : : string > completions = ctrlvec_load_prompt_file ( params . cvector_completions_file , false ) ;
auto format_template = [ ] ( std : : string persona , std : : string suffix ) {
2024-06-22 17:19:37 +02:00
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
return persona + suffix ;
2024-06-15 18:53:40 +02:00
} ;
for ( size_t i = 0 ; i < positive_prompts . size ( ) ; + + i ) {
for ( int j = 0 ; j < std : : min ( ( int ) completions . size ( ) , params . n_completions ) ; + + j ) {
// TODO replicate the truncations done by the python implementation
ctx_train . positive_entries . push_back ( format_template ( positive_prompts [ i ] , completions [ j ] ) ) ;
ctx_train . negative_entries . push_back ( format_template ( negative_prompts [ i ] , completions [ j ] ) ) ;
}
}
return 0 ;
}
int main ( int argc , char * * argv ) {
gpt_params params ;
if ( ! gpt_params_parse ( argc , argv , params ) ) {
print_usage ( argc , argv , params ) ;
return 1 ;
}
if ( params . n_pca_iterations % params . n_pca_batch ! = 0 ) {
fprintf ( stderr , " PCA iterations must by multiply of PCA batch size \n " ) ;
return 1 ;
}
callback_data cb_data ;
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params . cb_eval = cb_eval ;
params . cb_eval_user_data = & cb_data ;
params . warmup = false ;
print_build_info ( ) ;
llama_backend_init ( ) ;
llama_numa_init ( params . numa ) ;
// load the model to get hparams
llama_model * model ;
llama_context * ctx ;
std : : tie ( model , ctx ) = llama_init_from_gpt_params ( params ) ;
// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer ( model ) ;
int n_embd = llama_n_embd ( model ) ;
// get model hint param (a.k.a model arch name)
char model_hint [ 128 ] ;
llama_model_meta_val_str ( model , " general.architecture " , model_hint , 128 ) ;
// init train_context
train_context ctx_train ( n_embd , n_layers ) ;
// load and prepare entries for training
prepare_entries ( params , ctx_train ) ;
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
std : : vector < tokenized_prompt > tokenized_prompts ;
size_t n_total_tokens = 0 ;
for ( size_t i = 0 ; i < ctx_train . positive_entries . size ( ) ; + + i ) {
tokenized_prompt t ( ctx , ctx_train . positive_entries [ i ] , ctx_train . negative_entries [ i ] ) ;
n_total_tokens + = 2 * t . max_seq_len ;
tokenized_prompts . push_back ( std : : move ( t ) ) ;
}
std : : cout < < " n_total_tokens: " < < n_total_tokens < < std : : endl ;
for ( size_t i = 0 ; i < ctx_train . positive_entries . size ( ) ; + + i ) {
bool success = false ;
tokenized_prompt t = tokenized_prompts [ i ] ;
cb_data . n_layers = n_layers ;
cb_data . n_tokens = t . max_seq_len ;
printf ( " Evaluating prompt[%d/%d]: \" %s \" - \" %s \" (%d tokens) \n " ,
( int ) i + 1 , ( int ) ctx_train . positive_entries . size ( ) ,
tokens_to_str ( ctx , t . tokens_pos . cbegin ( ) , t . tokens_pos . cend ( ) ) . c_str ( ) ,
tokens_to_str ( ctx , t . tokens_neg . cbegin ( ) , t . tokens_neg . cend ( ) ) . c_str ( ) ,
( int ) t . max_seq_len ) ;
cb_data . is_eval_pos = true ;
success = get_hidden_layers ( ctx , t . tokens_pos ) ;
if ( ! success ) break ;
cb_data . is_eval_pos = false ;
success = get_hidden_layers ( ctx , t . tokens_neg ) ;
if ( ! success ) break ;
// calculate diff and remove all zero rows
auto v_diff_filtered = cb_data . calc_diff ( ) ;
// save & concat the filtered v_diff to ctx_train
ctx_train . concat_diff_tmp ( v_diff_filtered ) ;
// reset for next iteration
cb_data . reset ( ) ;
}
// done with the model, we can now free it to make gain some memory
printf ( " Done evaluate prompts, unload model... \n " ) ;
llama_free ( ctx ) ;
llama_free_model ( model ) ;
// prepare ctx_train for PCA
ctx_train . build_v_diff ( ) ;
// run PCA
PCA : : pca_params pca_params ;
pca_params . n_threads = params . n_threads ;
pca_params . n_batch = params . n_pca_batch ;
pca_params . n_iterations = params . n_pca_iterations ;
PCA : : run_pca ( pca_params , ctx_train . v_diff , ctx_train . v_final ) ;
// write output vectors to gguf
export_gguf ( ctx_train . v_final , params . cvector_outfile , model_hint ) ;
llama_backend_free ( ) ;
return 0 ;
}