From 17bb0eaec3585159eb15e843bf7611005de9b098 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 2 Jul 2024 10:40:14 +0300
Subject: [PATCH] llama : UGM tokenizer init with UNK tokens instead of PAD

---
 src/llama.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 7cdd27d86..2a1227c72 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -14890,9 +14890,9 @@ struct llm_tokenizer_ugm {
         size_t input_len = normalized.size();
 
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
         // at the beginning tokenization score is zero
-        tokenization_results[0] = { 0, 0, 0 };
+        tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
 
         for (size_t input_offset = 0; input_offset < input_len;) {
             size_t prefix_offset = input_offset;
@@ -14912,7 +14912,7 @@ struct llm_tokenizer_ugm {
                     single_codepoint_token_found = true;
                 }
                 llama_token token_id = node->value;
-                const auto &token_data = vocab.id_to_token[token_id];
+                const auto & token_data = vocab.id_to_token[token_id];
 
                 // we set the user-defined token scores to 0 to make them more likely to be selected
                 // (normal token scores are log probabilities, so they are negative)
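
The change above seeds the UGM tokenizer's Viterbi-style dynamic-programming
table with vocab.special_unk_id instead of the literal 0, which in
UGM/SentencePiece vocabularies is typically the PAD token. The sketch below
isolates that seeding; best_tokenization mirrors the struct referenced in the
diff, while init_dp, its parameters, and the type alias are hypothetical names
introduced here for illustration, not llama.cpp API.

    #include <cfloat>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using llama_token = int32_t;   // assumed alias; matches llama.cpp's token type

    // Mirrors the DP entry from the diff: the token ending at this position,
    // the offset where that token starts, and the best cumulative score.
    struct best_tokenization {
        llama_token token_id;
        size_t      input_offset;
        float       score_sum;
    };

    // Hypothetical helper isolating the initialization the patch touches.
    // Every slot starts at -FLT_MAX so that any real sum of (negative)
    // log-probability token scores beats it; slot 0, the empty prefix,
    // starts at score 0. Seeding token_id with unk_id instead of 0 means
    // a slot the search never updates reads back as UNK, not as token 0
    // (commonly PAD in UGM vocabularies).
    std::vector<best_tokenization> init_dp(size_t input_len, llama_token unk_id) {
        std::vector<best_tokenization> dp(input_len + 1, {unk_id, 0, -FLT_MAX});
        dp[0] = { unk_id, 0, 0.0f };
        return dp;
    }

Under this seeding, any slot the search never improves still carries a usable
token id, so when the best path is read back, uncovered positions surface as
UNK rather than as whatever token happens to have id 0.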