vocab : correctly identify LF token for GPT-2 style BPE tokenizer (#11496)

This commit is contained in:
mgroeber9110 2025-01-30 11:10:59 +01:00 committed by GitHub
parent 4314e56c4f
commit ffd0821c57
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1692,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
linefeed_id = ids[0];
} else {
-const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+const std::vector<int> ids = tokenize("\n", false);
//GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
if (ids.empty()) {