From c4e6dd59e45ef7b14f7763fb073b517395dc176c Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 13 Feb 2024 18:18:16 +0200 Subject: [PATCH] llama : allow raw byte in SPM vocabs; don't crash on nl 404 (#5478) * common : don't crash if newline token is not found * common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs --- llama.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 61c695187..8ebbf7628 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3314,7 +3314,12 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + try { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } catch (const std::exception & e) { + LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what()); + vocab.linefeed_id = vocab.special_pad_id; + } } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { vocab.linefeed_id = vocab.special_pad_id; } else { @@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; - return vocab.token_to_id.at(buf); + auto token = vocab.token_to_id.find(buf); + if (token != vocab.token_to_id.end()) { + return (*token).second; + } + // Try to fall back to just the byte as a string + const char buf2[2] = { (char)ch, 0 }; + return vocab.token_to_id.at(buf2); } case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: {