vocab : correctly identify LF token for GPT-2 style BPE tokenizer (#11496)

2025-02-05 16:10:42 +01:00 · 2025-01-30 11:10:59 +01:00 · 2025-01-30 11:10:59 +01:00 · ffd0821c57
commit ffd0821c57
parent 4314e56c4f
1 changed file with 1 addition and 1 deletion
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1692,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        linefeed_id = ids[0];
    } else {
-        const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+        const std::vector<int> ids = tokenize("\n", false);

        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        if (ids.empty()) {