llama : avoid double token-to-piece cache (#7654)

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-06-03 08:34:43 +03:00 committed by GitHub
parent 9e405b6e2e
commit 549279d804
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -2164,8 +2164,7 @@ struct llama_vocab {
std::vector<token_data> id_to_token; std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens; std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false); std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks; std::map<std::pair<std::string, std::string>, int> bpe_ranks;
@@ -4845,23 +4844,19 @@ static void llm_load_vocab(
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size()); LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
} }
// build token to piece caches // build token to piece cache
{ {
size_t size_cache = 0; size_t size_cache = 0;
std::vector<llama_vocab::token> cache_token_to_piece (n_vocab); std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
for (uint32_t id = 0; id < n_vocab; ++id) { for (uint32_t id = 0; id < n_vocab; ++id) {
cache_token_to_piece[id] = llama_token_to_piece(&model, id, false); cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
size_cache += cache_token_to_piece[id].size(); size_cache += cache_token_to_piece[id].size();
size_cache += cache_token_to_piece_special[id].size();
} }
std::swap(vocab.cache_token_to_piece, cache_token_to_piece); std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
} }
@@ -18318,9 +18313,14 @@ static std::string llama_decode_text(const std::string & text) {
// does not write null-terminator to buf // does not write null-terminator to buf
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) { int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
if (!special && llama_is_control_token(model->vocab, token)) {
return 0;
}
// if we have a cache - use it // if we have a cache - use it
{ {
const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece; const auto & cache = model->vocab.cache_token_to_piece;
if (!cache.empty()) { if (!cache.empty()) {
const auto & res = cache.at(token); const auto & res = cache.at(token);