mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-27 20:43:07 +01:00
llama : fix mpt and olmo pre-tokenizer
This commit is contained in:
parent
1c5eba6f8e
commit
db2ffd519d
@ -5170,6 +5170,28 @@ static void llm_load_vocab(
|
|||||||
vocab.token_to_id[word] = i;
|
vocab.token_to_id[word] = i;
|
||||||
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
|
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
|
||||||
|
|
||||||
|
// TODO: properly handle pre-normalized added_tokens and remove this
|
||||||
|
// handle space tokens with dual tokens,
|
||||||
|
// like the pre-normalized added_tokens
|
||||||
|
// of neox-style tokenizers (mpt, olmo, stablelm, etc)
|
||||||
|
if (word.find(' ') != std::string::npos) {
|
||||||
|
// same as in the internal `unicode_byte_encoding_process`
|
||||||
|
// TODO: extract and expose this in some unicode_* function
|
||||||
|
std::string text_utf;
|
||||||
|
auto utf_word = unicode_cpts_from_utf8(word);
|
||||||
|
for (size_t i = 0; i < utf_word.size(); ++i) {
|
||||||
|
text_utf += unicode_cpt_to_utf8(utf_word[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string encoded_token;
|
||||||
|
for (char & c : text_utf) {
|
||||||
|
encoded_token += unicode_byte_to_utf8(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
// override token id
|
||||||
|
vocab.token_to_id[encoded_token] = i;
|
||||||
|
}
|
||||||
|
|
||||||
auto & token_data = vocab.id_to_token[i];
|
auto & token_data = vocab.id_to_token[i];
|
||||||
token_data.text = std::move(word);
|
token_data.text = std::move(word);
|
||||||
token_data.score = scores ? scores[i] : 0.0f;
|
token_data.score = scores ? scores[i] : 0.0f;
|
||||||
@ -13890,13 +13912,9 @@ struct llm_tokenizer_bpe {
|
|||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_MPT:
|
case LLAMA_VOCAB_PRE_TYPE_MPT:
|
||||||
// TODO: MPT pre-tokenization regexes are unknown
|
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||||
// the following are close, but not exact. run the following:
|
|
||||||
// ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
|
|
||||||
GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
|
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
"\\s?\\p{L}+",
|
"[ ]{2,24}", // the spaces from the added_tokens are split separately
|
||||||
"\\s?\\p{P}+",
|
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
@ -13909,7 +13927,6 @@ struct llm_tokenizer_bpe {
|
|||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
||||||
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
};
|
};
|
||||||
@ -13985,6 +14002,10 @@ struct llm_tokenizer_bpe {
|
|||||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||||
int final_prev_index = -1;
|
int final_prev_index = -1;
|
||||||
|
|
||||||
|
// FIXME: pre-tokenize added_tokens (user-defined tokens) before other pre-tokenization
|
||||||
|
// ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
|
||||||
|
// (useful for neox-style tokenizers)
|
||||||
|
|
||||||
const auto word_collection = unicode_regex_split(text, regex_exprs);
|
const auto word_collection = unicode_regex_split(text, regex_exprs);
|
||||||
|
|
||||||
symbols_final.clear();
|
symbols_final.clear();
|
||||||
|
Loading…
Reference in New Issue
Block a user