llama : UGM tokenizer init with UNK tokens instead of PAD

This commit is contained in:
Georgi Gerganov 2024-07-02 10:40:14 +03:00
parent 9eb5d5617d
commit 17bb0eaec3
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@@ -14890,9 +14890,9 @@ struct llm_tokenizer_ugm {
     size_t input_len = normalized.size();
     // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-    std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
+    std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
     // at the beginning tokenization score is zero
-    tokenization_results[0] = { 0, 0, 0 };
+    tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
     for (size_t input_offset = 0; input_offset < input_len;) {
         size_t prefix_offset = input_offset;