#pragma once #include "llama.h" #include <unordered_map> #include <string> #include <vector> #define LLAMA_NGRAM_MIN 1 #define LLAMA_NGRAM_MAX 4 #define LLAMA_NGRAM_STATIC 2 // Data structures to map n-grams to empirical token probabilities: struct llama_ngram { llama_token tokens[LLAMA_NGRAM_MAX]; llama_ngram() { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { tokens[i] = -1; } } llama_ngram(const llama_token * input, const int ngram_size) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { tokens[i] = i < ngram_size ? input[i] : -1; } } bool operator==(const llama_ngram & other) const { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { if (tokens[i] != other.tokens[i]) { return false; } } return true; } }; struct llama_ngram_hash_function { size_t operator()(const llama_ngram & ngram) const { size_t hash = 0; for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { hash ^= std::hash<llama_token>{}(ngram.tokens[i]); } return hash; } }; // token -> number of times token has been seen typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part; // n-gram -> empirical distribution of following tokens typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache; // Update an ngram cache with tokens. // ngram_cache: the cache to modify. // ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data. // inp_data: the token sequence with which to update ngram_cache. // nnew: how many new tokens have been appended to inp_data since the last call to this function. // print_progress: whether to print progress to stderr. // // In order to get correct results inp_data can ONLY BE APPENDED TO. // Changes in the middle need a complete rebuild. void llama_ngram_cache_update( llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress); // Try to draft tokens from ngram caches. // inp: the tokens generated so far. // draft: the token sequence to draft. Expected to initially contain the previously sampled token. // n_draft: maximum number of tokens to add to draft. // ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic. // nc_context: ngram cache based on current context. // nc_dynamic: ngram cache based on previous user generations. // nc_static: ngram cache generated from a large text corpus, used for validation. void llama_ngram_cache_draft( std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max, llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); // Save an ngram cache to a file. // ngram_cache: the ngram cache to save. // filename: the path under which to save the ngram cache. void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); // Load an ngram cache saved with llama_ngram_cache_save. // filename: the path from which to load the ngram cache. // returns: an ngram cache containing the information saved to filename. llama_ngram_cache llama_ngram_cache_load(std::string & filename); // Merge two ngram caches. // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. // ngram_cache_add: the ngram cache to add to ngram_cache_target. void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);