#include "common.h"
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cstdio>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <sstream>

//////////////////////////////////////////////////
// utils

// Concatenate the text piece of every token in [begin, end) into one string.
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
}

// Print the common parameter help followed by a few example invocations.
static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);

    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
    printf("\n");
}

//////////////////////////////////////////////////

// State shared with cb_eval; it is reused for each pair of positive/negative prompts.
struct callback_data {
    ggml_context * ctx_ggml = nullptr; // holds the metadata of v_pos, v_neg, v_diff_filtered

    int n_layers = 0;
    int n_tokens = 0;
    bool is_eval_pos = true;

    // each element of the vectors corresponds to one captured layer
    std::vector<struct ggml_tensor *> v_pos;           // matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_neg;           // matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_diff_filtered; // matrices of size [n_embd, n_nonzero_rows]
                                                       // NOTE: n_nonzero_rows may differ between layers

    // Copy tensor t into host memory and append it to v_pos or v_neg
    // (selected by is_eval_pos). Only F32 tensors are accepted.
    void save_tensor_for_layer(struct ggml_tensor * t) {
        GGML_ASSERT(t->type == GGML_TYPE_F32);

        if (ctx_ggml == nullptr) {
            // lazily create a metadata-only context (no_alloc = true), sized for
            // up to 3 tensors per layer: positive, negative and filtered diff
            struct ggml_init_params params_ggml = {
                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ctx_ggml = ggml_init(params_ggml);
        }

        // copy tensor data; the buffer is owned by this struct and released in reset()
        const size_t n_bytes = ggml_nbytes(t);
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
        GGML_ASSERT(n_bytes == 0 || t_layer->data != nullptr);
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
        ggml_set_name(t_layer, ggml_get_name(t));
        //print_debug_tensor(t_layer);

        if (is_eval_pos) {
            v_pos.push_back(t_layer);
        } else {
            v_neg.push_back(t_layer);
        }
    }

    // Compute the element-wise diff (v_pos - v_neg) in place in v_pos, then append a
    // copy of each diff with its all-zero rows removed to v_diff_filtered.
    // NOTE: the final layer is not captured, so there are (n_layers - 1) tensors to process.
    // For convenience, a copy of v_diff_filtered is returned.
    std::vector<struct ggml_tensor *> calc_diff() {
        // both evaluations must have captured the same layers, otherwise v_neg[il] is out of range
        GGML_ASSERT(v_pos.size() == v_neg.size());
        for (size_t il = 0; il < v_pos.size(); il++) {
            GGML_ASSERT(ggml_nelements(v_pos[il]) == ggml_nelements(v_neg[il]));
            float * a = (float *) v_pos[il]->data;
            const float * b = (const float *) v_neg[il]->data;
            const size_t n_elem = ggml_nelements(v_pos[il]);
            for (size_t j = 0; j < n_elem; j++) {
                a[j] -= b[j];
            }
            //print_debug_tensor(v_pos[il]);
            v_diff_filtered.push_back(filter_nonzero_rows(v_pos[il]));
        }
        return v_diff_filtered;
    }

    // Return a new 2D tensor containing only the rows of `a` that have at least one
    // element whose magnitude exceeds 1e-6. Asserts that at least one such row exists.
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
        //printf("filter_nonzero_rows\n");
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
            const int n_cols = t->ne[0]; // hint: should be equal to n_embd
            for (int col = 0; col < n_cols; ++col) {
                // compare the magnitude: a negative entry is just as "non-zero" as a positive one
                if (std::fabs(ggml_get_f32_nd(t, col, row, 0, 0)) > eps) {
                    return false;
                }
            }
            return true;
        };

        std::vector<int> rows_to_copy; // indices of the non-zero rows of `a`
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
            if (!is_row_all_zeros(a, i_row, 1e-6)) {
                rows_to_copy.push_back(i_row);
            }
        }

        // get "n_nonzero_rows" for the output "diff_filtered"
        const int n_nonzero_rows = rows_to_copy.size();
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
        const int n_embd = a->ne[0];
        GGML_ASSERT(n_nonzero_rows > 0);

        // diff_filtered: [n_embd, n_nonzero_rows]
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
        GGML_ASSERT(diff_filtered->data != nullptr);

        // copy non-zero rows
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
            const int src_row = rows_to_copy[dest_row];
            for (int i = 0; i < n_embd; i++) {
                const float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
            }
        }

        //print_debug_tensor(diff_filtered);

        return diff_filtered;
    }

    // There is deliberately no destructor, because callback_data is reused between
    // prompt pairs: reset() frees the tensor buffers and the ggml context instead.
    // Calling it when nothing has been captured is a no-op.
    void reset() {
        for (auto ptr : v_pos)           free(ptr->data);
        for (auto ptr : v_neg)           free(ptr->data);
        for (auto ptr : v_diff_filtered) free(ptr->data);
        v_pos.clear();
        v_neg.clear();
        v_diff_filtered.clear();
        if (ctx_ggml) {
            ggml_free(ctx_ggml);
        }
        ctx_ggml = nullptr;
    }
};

/**
 * train_context stores the ggml context used for pre/post-processing the diff vectors.
 * In short: input => v_diff and output => v_final
 */
struct train_context {
    ggml_context * ctx_ggml;
    int n_embd;
    int n_layers;

    /* pairs of prompts to be used for generating the final vector */
    std::vector<std::string> positive_entries;
    std::vector<std::string> negative_entries;

    // each element of the vectors corresponds to one layer
    // NOTE: the last layer is discarded, therefore there are (n_layers - 1) elements here
    // NOTE (2): v_diff is transposed from v_diff_tmp
    std::vector<struct ggml_tensor *> v_diff;  // matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
    std::vector<struct ggml_tensor *> v_final; // vectors of size [n_embd] to be written to file

    // to make re-allocation easy while concatenating, the diff is first accumulated
    // as raw bytes; v_diff_tmp is converted into v_diff later on by build_v_diff()
    std::vector<std::vector<uint8_t>> v_diff_tmp;

    train_context(int n_embd_, int n_layers_) {
        n_embd = n_embd_;
        n_layers = n_layers_;
        // metadata-only context: room for one v_diff and one v_final tensor per used layer
        struct ggml_init_params params_ggml = {
            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx_ggml = ggml_init(params_ggml);
        for (int il = 0; il < n_layers - 1; il++) {
            std::vector<uint8_t> empty;
            v_diff_tmp.push_back(empty);
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
            GGML_ASSERT(ggml_nbytes(t) == 0 || t->data != nullptr);
            v_final.push_back(t);
        }
    }

    // the tensor buffers are owned via raw malloc/free, so copying would double-free them
    train_context(const train_context &) = delete;
    train_context & operator=(const train_context &) = delete;

    // Append the raw data of each per-layer tensor to the matching buffer in v_diff_tmp.
    // diff_filtered must contain exactly (n_layers - 1) tensors.
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
        for (int il = 0; il < n_layers - 1; il++) {
            auto t = diff_filtered[il];
            auto & diff_tmp = v_diff_tmp[il];
            const size_t curr_size = diff_tmp.size();
            const size_t n_bytes   = ggml_nbytes(t);
            diff_tmp.resize(curr_size + n_bytes);
            memcpy(diff_tmp.data() + curr_size, t->data, n_bytes);
        }
    }

    // Build the v_diff tensors from v_diff_tmp (the data is transposed while copying).
    // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
    void build_v_diff() {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            // element counts are kept in size_t: the accumulated buffers can be large
            const size_t n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            const int n_rows = (int) (n_elem / n_embd);
            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            // copy data & transpose
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            GGML_ASSERT(ggml_nbytes(diff) == 0 || diff->data != nullptr);
            const float * arr = (const float *) diff_tmp.data();
            for (int ir = 0; ir < n_rows; ++ir) {
                for (int ic = 0; ic < n_embd; ++ic) {
                    const float f = arr[(size_t) ir * n_embd + ic];
                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                }
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // actually release the memory of diff_tmp: resize(0) would keep the capacity
            std::vector<uint8_t>().swap(diff_tmp);
        }
    }

    ~train_context() {
        for (auto ptr : v_final) free(ptr->data);
        for (auto ptr : v_diff)  free(ptr->data);
        // no need to free v_diff_tmp, since it does not use malloc
        ggml_free(ctx_ggml);
    }
};

// A positive/negative prompt pair, tokenized and padded to the same length.
struct tokenized_prompt {
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
    }

    // Extend `tokens` to `len` entries by appending the last token of the tokenization of " ".
    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        if (tokens.size() >= len) {
            return; // nothing to pad, so no pad token is needed
        }
        // TODO: customize padding token
        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
        GGML_ASSERT(!pad_tokens.empty()); // back() on an empty vector is undefined behavior
        const llama_token pad_tok = pad_tokens.back();
        tokens.resize(len, pad_tok);
    }
};

//////////////////////////////////////////////////

// Format any streamable value as a string using operator<<.
template <typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

// Read `path` line by line, process escape sequences in every kept line and return the lines.
// Empty lines are dropped when skip_empty_lines is true.
// Exits the process with status 1 if the file cannot be opened.
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
    std::vector<std::string> output;
    std::ifstream file(path);
    if (!file.is_open()) {
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
        exit(1);
    }
    std::string line;
    while (std::getline(file, line)) {
        if (skip_empty_lines && line.empty()) {
            continue;
        }
        string_process_escapes(line);
        output.push_back(line);
    }
    file.close();
    return output;
}

//////////////////////////////////////////////////

// Eval callback; user_data must point to a callback_data.
// When `ask` is true, the return value tells whether the tensor is wanted: only tensors
// whose name starts with "l_out". Otherwise, a wanted tensor whose ne[1] equals the
// expected n_tokens is saved; true is always returned in this mode.
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;

    static const char * l_out_name = "l_out";
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;

    if (ask) {
        return is_l_out;
    }

    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
        return true;
    }

    // save the tensor to current context
    cb_data->save_tensor_for_layer(t);
    return true;
}

// Clear the KV cache and decode `tokens` as a single batch; the hidden states are
// collected through cb_eval. Returns false if decoding fails.
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_kv_cache_clear(ctx);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}

// Write the control vectors to a GGUF file named `fname`, with the architecture set to
// "controlvector" and the model hint and layer count stored as metadata.
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
    struct gguf_context * ctx = gguf_init_empty();

    const std::string arch = "controlvector";
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), (int32_t) v_ctrl.size());

    for (size_t i = 0; i < v_ctrl.size(); ++i) {
        gguf_add_tensor(ctx, v_ctrl[i]);
        print_debug_tensor(v_ctrl[i]);
        printf("Added tensor: %s\n", v_ctrl[i]->name);
    }

    printf("%s: writing file...\n", __func__);
    gguf_write_to_file(ctx, fname.c_str(), false);
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    gguf_free(ctx);
}

/**
 * Load prompt files and completion file.
 * Then format each pair of prompt + completion to make an entry.
 *
 * Returns 0 on success, 1 if the inputs cannot produce at least one entry pair.
 */
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
    if (positive_prompts.size() != negative_prompts.size()) {
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
        return 1;
    }
    if (positive_prompts.empty()) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }

    // create templated prompts
    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
    const int n_used_completions = std::min((int) completions.size(), params.n_completions);
    if (n_used_completions <= 0) {
        // without completions no entry would be generated and there would be nothing to train on
        fprintf(stderr, "must provide at least one completion\n");
        return 1;
    }

    auto format_template = [](std::string persona, std::string suffix) {
        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
        return persona + " " + suffix;
    };
    for (size_t i = 0; i < positive_prompts.size(); ++i) {
        for (int j = 0; j < n_used_completions; ++j) {
            // TODO replicate the truncations done by the python implementation
            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
        }
    }
    return 0;
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
    }

    if (params.n_pca_batch <= 0) {
        // also protects the modulo below from a division by zero
        fprintf(stderr, "PCA batch size must be positive\n");
        return 1;
    }
    if (params.n_pca_iterations % params.n_pca_batch != 0) {
        fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
        return 1;
    }

    callback_data cb_data;

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model to get hparams
    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s : failed to load model\n", __func__);
        llama_backend_free();
        return 1;
    }

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_n_layer(model);
    int n_embd = llama_n_embd(model);
    // get model hint param (a.k.a model arch name)
    char model_hint[128] = {0}; // zero-initialized so it is a valid string even if the key is missing
    llama_model_meta_val_str(model, "general.architecture", model_hint, sizeof(model_hint));

    // init train_context
    train_context ctx_train(n_embd, n_layers);

    // load and prepare entries for training
    if (prepare_entries(params, ctx_train) != 0) {
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }

    // pretokenize everything up front, so the total number of tokens is known
    std::vector<tokenized_prompt> tokenized_prompts;
    size_t n_total_tokens = 0;
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
        n_total_tokens += 2 * t.max_seq_len;
        tokenized_prompts.push_back(std::move(t));
    }

    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;

    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        bool success = false;
        tokenized_prompt & t = tokenized_prompts[i]; // by reference: no need to copy the token vectors
        cb_data.n_layers = n_layers;
        cb_data.n_tokens = t.max_seq_len;

        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
            (int) i+1, (int) ctx_train.positive_entries.size(),
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
            (int) t.max_seq_len);

        cb_data.is_eval_pos = true;
        success = get_hidden_layers(ctx, t.tokens_pos);
        if (!success) break;

        cb_data.is_eval_pos = false;
        success = get_hidden_layers(ctx, t.tokens_neg);
        if (!success) break;

        // calculate diff and remove all zero rows
        auto v_diff_filtered = cb_data.calc_diff();

        // save & concat the filtered v_diff to ctx_train
        ctx_train.concat_diff_tmp(v_diff_filtered);

        // reset for next iteration
        cb_data.reset();
    }

    // release whatever was captured if the loop stopped early (no-op otherwise)
    cb_data.reset();

    // done with the model, it can now be freed to regain some memory
    printf("Done evaluate prompts, unload model...\n");
    llama_free(ctx);
    llama_free_model(model);

    // prepare ctx_train for PCA
    ctx_train.build_v_diff();

    // run PCA
    PCA::pca_params pca_params;
    pca_params.n_threads    = params.n_threads;
    pca_params.n_batch      = params.n_pca_batch;
    pca_params.n_iterations = params.n_pca_iterations;
    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);

    // write output vectors to gguf
    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);

    llama_backend_free();

    return 0;
}