mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 06:10:29 +01:00
llama : UGM tokenizer init with UNK tokens instead of PAD
This commit is contained in:
parent
9eb5d5617d
commit
17bb0eaec3
@@ -14890,9 +14890,9 @@ struct llm_tokenizer_ugm {
     size_t input_len = normalized.size();

     // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-    std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
+    std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
     // at the beginning tokenization score is zero
-    tokenization_results[0] = { 0, 0, 0 };
+    tokenization_results[0] = { vocab.special_unk_id, 0, 0 };

     for (size_t input_offset = 0; input_offset < input_len;) {
         size_t prefix_offset = input_offset;
Loading…
Reference in New Issue
Block a user