From ffd0821c57edc7e5d04338ab0c6b1461198df15f Mon Sep 17 00:00:00 2001
From: mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
Date: Thu, 30 Jan 2025 11:10:59 +0100
Subject: [PATCH] vocab : correctly identify LF token for GPT-2 style BPE
 tokenizer (#11496)

---
 src/llama-vocab.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 561f8bdb8..ad9ffe66a 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1692,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
         linefeed_id = ids[0];
     } else {
-        const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+        const std::vector<int> ids = tokenize("\n", false);
 
         //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
         if (ids.empty()) {
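
For context on the removed literal: GPT-2 style byte-level BPE tokenizers first remap raw bytes to printable code points, and byte 0x0A ('\n') lands on U+010A ('Ċ'), whose UTF-8 encoding is 0xC4 0x8A. That is why the old line looked up the hard-coded sequence "\xC4\x8A"; tokenizing "\n" directly instead lets the tokenizer's own byte mapping handle the conversion, which is what the commit subject describes. The standalone sketch below reimplements that byte-to-code-point convention for illustration only; the helper names bytes_to_unicode() and to_utf8() are made up here and are not code from llama.cpp.

// Illustrative sketch of the GPT-2 "bytes to unicode" convention (not llama.cpp code):
// shows that '\n' (0x0A) maps to U+010A, i.e. the UTF-8 bytes 0xC4 0x8A.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

// Build the GPT-2 byte -> code point table: printable/latin bytes map to
// themselves, all remaining bytes are assigned code points 256, 257, ...
static std::map<uint8_t, uint32_t> bytes_to_unicode() {
    std::map<uint8_t, uint32_t> m;
    auto kept = [](int b) {
        return (b >= 0x21 && b <= 0x7E) || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF);
    };
    uint32_t n = 0;
    for (int b = 0; b < 256; ++b) {
        m[(uint8_t) b] = kept(b) ? (uint32_t) b : 256 + n++;
    }
    return m;
}

// Minimal UTF-8 encoder, sufficient for code points below U+0800.
static std::string to_utf8(uint32_t cp) {
    std::string s;
    if (cp < 0x80) {
        s += (char) cp;
    } else {
        s += (char) (0xC0 | (cp >> 6));
        s += (char) (0x80 | (cp & 0x3F));
    }
    return s;
}

int main() {
    const auto table = bytes_to_unicode();
    const uint32_t cp = table.at((uint8_t) '\n'); // expected: 0x010A ('Ċ')
    const std::string utf8 = to_utf8(cp);         // expected: "\xC4\x8A"
    printf("'\\n' -> U+%04X, UTF-8:", (unsigned) cp);
    for (unsigned char c : utf8) {
        printf(" 0x%02X", c);
    }
    printf("\n");
    return 0;
}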