llama : UGM tokenizer init with UNK tokens instead of PAD

2024-12-26 06:10:29 +01:00 · 2024-07-02 10:40:14 +03:00 · 2024-07-02 10:40:14 +03:00 · 17bb0eaec3
commit 17bb0eaec3
parent 9eb5d5617d
1 changed files with 3 additions and 3 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -14890,9 +14890,9 @@ struct llm_tokenizer_ugm {
        size_t input_len = normalized.size();

        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
        // at the beginning tokenization score is zero
-        tokenization_results[0] = { 0, 0, 0 };
+        tokenization_results[0] = { vocab.special_unk_id, 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;