mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 13:58:46 +01:00
llama : allow raw byte in SPM vocabs; don't crash on nl 404 (#5478)
* common : don't crash if newline token is not found
* common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs
This commit is contained in:
parent
037259be68
commit
c4e6dd59e4
15
llama.cpp
15
llama.cpp
@ -3314,7 +3314,12 @@ static void llm_load_vocab(
|
|||||||
|
|
||||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||||
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||||
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
try {
|
||||||
|
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
||||||
|
} catch (const std::exception & e) {
|
||||||
|
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
|
||||||
|
vocab.linefeed_id = vocab.special_pad_id;
|
||||||
|
}
|
||||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||||||
vocab.linefeed_id = vocab.special_pad_id;
|
vocab.linefeed_id = vocab.special_pad_id;
|
||||||
} else {
|
} else {
|
||||||
@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
|||||||
switch (llama_vocab_get_type(vocab)) {
|
switch (llama_vocab_get_type(vocab)) {
|
||||||
case LLAMA_VOCAB_TYPE_SPM: {
|
case LLAMA_VOCAB_TYPE_SPM: {
|
||||||
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
||||||
return vocab.token_to_id.at(buf);
|
auto token = vocab.token_to_id.find(buf);
|
||||||
|
if (token != vocab.token_to_id.end()) {
|
||||||
|
return (*token).second;
|
||||||
|
}
|
||||||
|
// Try to fall back to just the byte as a string
|
||||||
|
const char buf2[2] = { (char)ch, 0 };
|
||||||
|
return vocab.token_to_id.at(buf2);
|
||||||
}
|
}
|
||||||
case LLAMA_VOCAB_TYPE_WPM:
|
case LLAMA_VOCAB_TYPE_WPM:
|
||||||
case LLAMA_VOCAB_TYPE_BPE: {
|
case LLAMA_VOCAB_TYPE_BPE: {
|
||||||
|
Loading…
Reference in New Issue
Block a user