mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-28 21:07:06 +01:00
llama : UGM tokenizer init with UNK tokens instead of PAD
This commit is contained in:
parent
9eb5d5617d
commit
17bb0eaec3
@ -14890,9 +14890,9 @@ struct llm_tokenizer_ugm {
|
||||
size_t input_len = normalized.size();
|
||||
|
||||
// initialize score_sum to -FLT_MAX so it will always be lower than sums of token scores
|
||||
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
|
||||
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
|
||||
// at the beginning tokenization score is zero
|
||||
tokenization_results[0] = { 0, 0, 0 };
|
||||
tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
|
||||
|
||||
for (size_t input_offset = 0; input_offset < input_len;) {
|
||||
size_t prefix_offset = input_offset;
|
||||
@ -14912,7 +14912,7 @@ struct llm_tokenizer_ugm {
|
||||
single_codepoint_token_found = true;
|
||||
}
|
||||
llama_token token_id = node->value;
|
||||
const auto &token_data = vocab.id_to_token[token_id];
|
||||
const auto & token_data = vocab.id_to_token[token_id];
|
||||
|
||||
// we set the user-defined token scores to 0 to make them more likely to be selected
|
||||
// (normal token scores are log probabilities, so they are negative)
|
||||
|
Loading…
Reference in New Issue
Block a user