From aeeb9420a32699f5f55afc5471bcae61bcc8e473 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 9 Jan 2025 17:12:54 +0200 Subject: [PATCH] vocab : minor tokenization optimizations (#11160) ggml-ci Co-authored-by: Diego Devesa --- src/llama-vocab.cpp | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 8f3975c4f..425b9e83f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2345,19 +2345,21 @@ std::vector llama_vocab::impl::tokenize( for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + std::string text; // prefix with space if previous is special if (tokenizer_add_space_prefix && is_prev_special) { - raw_text = " " + raw_text; + text = ' '; } + text += fragment.raw_text.substr(fragment.offset, fragment.length); + #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - llama_escape_whitespace(raw_text); + llama_escape_whitespace(text); llm_tokenizer_spm_session session(vocab); - session.tokenize(raw_text, output); + session.tokenize(text, output); is_prev_special = false; } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); @@ -2387,12 +2389,12 @@ std::vector llama_vocab::impl::tokenize( } for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - session.tokenize(raw_text, output); + session.tokenize(text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) session.append(fragment.token, output); } @@ -2414,12 +2416,12 @@ std::vector llama_vocab::impl::tokenize( for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - session.tokenize(raw_text, output); + session.tokenize(text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } @@ -2440,11 +2442,11 @@ std::vector llama_vocab::impl::tokenize( for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - session.tokenize(raw_text, output); + session.tokenize(text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } @@ -2467,13 +2469,13 @@ std::vector llama_vocab::impl::tokenize( llm_tokenizer_rwkv_session session(vocab, *static_cast(tokenizer.get())); for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - session.tokenize(raw_text, output); + session.tokenize(text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); }