From 727368c60f2ebf2d6a7473a4a9f80957ab063a8e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 6 Jan 2025 10:52:15 +0200 Subject: [PATCH] llama : use LLAMA_TOKEN_NULL (#11062) ggml-ci --- common/common.cpp | 2 +- common/ngram-cache.cpp | 24 +++++++------- common/ngram-cache.h | 4 +-- examples/batched/batched.cpp | 2 +- .../convert-llama2c-to-ggml.cpp | 4 +-- examples/main/main.cpp | 4 +-- examples/server/utils.hpp | 2 +- include/llama.h | 1 - src/llama-model.cpp | 32 +++++++++---------- src/llama-sampling.cpp | 8 ++--- src/llama-vocab.cpp | 24 +++++++------- 11 files changed, 53 insertions(+), 54 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4bb140ee2..d6a7ab753 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -982,7 +982,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_encoder(model)) { llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { decoder_start_token_id = bos; } tmp.clear(); diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a9dfb6714..a057ae45f 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); if (part_static_it == nc_static.end()) { - return -1; + return LLAMA_TOKEN_NULL; } const common_ngram_cache_part part_static = part_static_it->second; int max_count_static = 0; int sum_count_static = 0; - llama_token max_token = -1; + llama_token max_token = LLAMA_TOKEN_NULL; for (std::pair token_count_static : part_static) { const llama_token token = token_count_static.first; @@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram } if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { - return -1; + return LLAMA_TOKEN_NULL; } if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { - return -1; + return LLAMA_TOKEN_NULL; } return max_token; } @@ -98,9 +98,9 @@ static llama_token try_draft( common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { - llama_token drafted_token = -1; + llama_token drafted_token = LLAMA_TOKEN_NULL; - for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { + for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) { const common_ngram ngram_primary = ngrams_primary[i]; common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); @@ -112,7 +112,7 @@ static llama_token try_draft( int max_count_primary = 0; int max_count_static = 0; int sum_count_primary = 0; - llama_token max_token = -1; + llama_token max_token = LLAMA_TOKEN_NULL; for (std::pair token_count_primary : part_primary) { const llama_token token = token_count_primary.first; @@ -154,7 +154,7 @@ void common_ngram_cache_draft( } while ((int) draft.size()-1 < n_draft) { - llama_token drafted_token = -1; + llama_token drafted_token = LLAMA_TOKEN_NULL; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; common_ngram ngram_static; @@ -177,17 +177,17 @@ void common_ngram_cache_draft( } ngrams_cd.push_back(ngram_cd); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_static, ngram_static); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { break; } diff --git a/common/ngram-cache.h b/common/ngram-cache.h index 09c2b0319..dfe012abe 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -17,13 +17,13 @@ struct common_ngram { common_ngram() { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = -1; + tokens[i] = LLAMA_TOKEN_NULL; } } common_ngram(const llama_token * input, const int ngram_size) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = i < ngram_size ? input[i] : -1; + tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL; } } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index e2e01f2d5..2e25b62f6 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -120,7 +120,7 @@ int main(int argc, char ** argv) { } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { decoder_start_token_id = llama_token_bos(model); } diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 736035d78..9c3a0c367 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -689,8 +689,8 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); - gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); - gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); + gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL); + gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL); gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b5e477f5b..aaee47e32 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -494,7 +494,7 @@ int main(int argc, char ** argv) { } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { decoder_start_token_id = llama_token_bos(model); } @@ -831,7 +831,7 @@ int main(int argc, char ** argv) { // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { llama_token eot = llama_token_eot(model); - embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot); need_insert_eot = false; } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index dc6e6e67e..ad130d490 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -507,7 +507,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); + std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) diff --git a/include/llama.h b/include/llama.h index a0d5ba5dd..0f619aa19 100644 --- a/include/llama.h +++ b/include/llama.h @@ -34,7 +34,6 @@ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF -// TODO: use everywhere in the implementation #define LLAMA_TOKEN_NULL -1 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 405e0528f..22596499a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1923,24 +1923,24 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } - if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); } - if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); } + if (vocab.special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } + if (vocab.special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); } + if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); } + if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); } - if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } + if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } - if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); } - if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); } - if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); } - if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); } - if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); } - if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); } + if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); } + if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); } + if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); } + if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); } + if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); } + if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); } for (const auto & id : vocab.special_eog_ids) { LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() ); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 69cea2f14..ef5a576cc 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -257,7 +257,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) for (int i = 0; i < (int)cur_p->size; ++i) { const float val = cur_p->data[i].logit; int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); - ib = std::max(0, std::min(nbuckets-1, ib)); + ib = std::max(0, std::min(nbuckets - 1, ib)); bucket_idx[i] = ib; ++histo[ib]; } @@ -280,13 +280,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) for (int i = 0; i < (int)cur_p->size; ++i) { int j = bucket_idx[i]; if (j >= ib) { - *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i]; + *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i]; } } ptr = tmp_tokens.data(); int ndone = 0; - for (int j = nbuckets-1; j > ib; --j) { + for (int j = nbuckets - 1; j > ib; --j) { std::sort(ptr, ptr + histo[j], comp); ptr += histo[j]; ndone += histo[j]; @@ -1832,7 +1832,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat ctx->dry_repeat_count[last - k] = std::min(n, rep_limit); if (n > 0) { lt = k; - rt = k+n-1; + rt = k + n - 1; } } else { // If k is inside the current Z-box, consider two cases. diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 3fcfcaa3f..a4c015484 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -497,7 +497,7 @@ struct llm_tokenizer_bpe_session { bool append_bos(std::vector & output) const { if (vocab.tokenizer_add_bos) { - GGML_ASSERT(vocab.special_bos_id != -1); + GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_bos_id); return true; } @@ -506,7 +506,7 @@ struct llm_tokenizer_bpe_session { bool append_eos(std::vector & output) const { if (vocab.tokenizer_add_eos) { - GGML_ASSERT(vocab.special_eos_id != -1); + GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_eos_id); return true; } @@ -1403,7 +1403,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (source == 0) { buffer.erase_after(buffer.before_begin()); } else { - buffer.erase_after(std::next(buffer.begin(), (source-1))); + buffer.erase_after(std::next(buffer.begin(), (source - 1))); } // repeat for the right side @@ -1417,7 +1417,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (source == 0) { buffer.erase_after(buffer.before_begin()); } else { - buffer.erase_after(std::next(buffer.begin(), (source-1))); + buffer.erase_after(std::next(buffer.begin(), (source - 1))); } break; } @@ -1454,7 +1454,7 @@ std::vector llama_tokenize_internal( bool is_prev_special = true; // prefix with space if first token if (add_special && vocab.tokenizer_add_bos) { - GGML_ASSERT(vocab.special_bos_id != -1); + GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_bos_id); is_prev_special = true; } @@ -1489,7 +1489,7 @@ std::vector llama_tokenize_internal( } if (add_special && vocab.tokenizer_add_eos) { - GGML_ASSERT(vocab.special_eos_id != -1); + GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_eos_id); } } break; @@ -1522,7 +1522,7 @@ std::vector llama_tokenize_internal( case LLAMA_VOCAB_TYPE_WPM: { if (add_special) { - GGML_ASSERT(vocab.special_cls_id != -1); + GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_cls_id); } @@ -1542,14 +1542,14 @@ std::vector llama_tokenize_internal( } if (add_special) { - GGML_ASSERT(vocab.special_sep_id != -1); + GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_sep_id); } } break; case LLAMA_VOCAB_TYPE_UGM: { if (add_special && vocab.tokenizer_add_bos) { - GGML_ASSERT(vocab.special_bos_id != -1); + GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_bos_id); } llm_tokenizer_ugm_session session(vocab); @@ -1574,7 +1574,7 @@ std::vector llama_tokenize_internal( } if (add_special && vocab.tokenizer_add_eos) { - GGML_ASSERT(vocab.special_eos_id != -1); + GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_eos_id); } } break; @@ -1642,7 +1642,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla } bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { - return token != -1 && vocab.special_eog_ids.count(token) > 0; + return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0; } bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) { @@ -1881,7 +1881,7 @@ int32_t llama_detokenize_impl( } if (remove_special && vocab.tokenizer_add_eos) { - if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) { + if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) { n_tokens--; } }