llama : throw on unknown tokenizer types

ggml-ci
Author: Georgi Gerganov
Date:   2024-05-29 21:06:56 +03:00
Parent: 21ccd645df
Commit: 1494a1841e


@@ -4595,20 +4595,14 @@ static void llm_load_vocab(
             vocab.special_cls_id  = 101;
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
-        } else {
-            if (tokenizer_model == "gpt2") {
-                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        } else if (tokenizer_model == "gpt2") {
+            vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
-                const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-                if (add_space_prefix_keyidx != -1) {
-                    vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-                }
-            } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
-                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-                vocab.type = LLAMA_VOCAB_TYPE_SPM;
-                return;
-            }
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            }
+
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
@@ -4642,6 +4636,8 @@ static void llm_load_vocab(
             vocab.special_pad_id  = -1;
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
+        } else {
+            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
 
         // for now, only BPE models have pre-tokenizers
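
For reference, a minimal sketch (not part of this commit) of how the change surfaces through the public API of this era. Before, a GGUF whose tokenizer.model was unrecognized loaded anyway, with a warning and a silent fallback to the SPM tokenizer; after this commit llm_load_vocab throws, and, assuming the loader's existing catch-and-fail error path, llama_load_model_from_file() returns NULL. The file name below is a hypothetical placeholder.

// sketch only: assumes llama.h from roughly this revision is on the include path
// and the program is linked against libllama
#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init();

    struct llama_model_params params = llama_model_default_params();

    // "unknown-tokenizer.gguf" is a hypothetical model file whose tokenizer.model
    // value is not one of the types handled in llm_load_vocab.
    //
    // before this commit: the loader logged "unknown tokenizer" / "using default
    // tokenizer: 'llama'" and continued with SPM; after it: llm_load_vocab throws,
    // the exception is caught inside the model loader, and loading fails.
    struct llama_model * model = llama_load_model_from_file("unknown-tokenizer.gguf", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model (e.g. unknown tokenizer type)\n");
    } else {
        llama_free_model(model);
    }

    llama_backend_free();
    return 0;
}

The upshot for callers: an unsupported tokenizer is now reported as a hard load failure at model-load time instead of producing a model that silently tokenizes with the wrong scheme.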