From 40f74e4d739e9250431cf339ae7588b28d8d0663 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 21 Apr 2024 18:36:45 +0300
Subject: [PATCH] llama : add option to render special/control tokens (#6807)

* make : fix common dep on llama.h

* llama : add option to render special tokens

* readme : add API change notice

ggml-ci

* swift : fix build
---
 Makefile                                  |  2 +-
 README.md                                 |  1 +
 common/common.cpp                         |  4 +--
 examples/batched.swift/Sources/main.swift |  5 ++--
 .../llama.cpp.swift/LibLlama.swift        |  4 +--
 llama.cpp                                 | 25 ++++++++++---------
 llama.h                                   |  4 ++-
 7 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/Makefile b/Makefile
index 760015f29..72fdc6ba4 100644
--- a/Makefile
+++ b/Makefile
@@ -699,7 +699,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
diff --git a/README.md b/README.md
index a5ba710d7..1d4e9d417 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Recent API changes
 
+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
diff --git a/common/common.cpp b/common/common.cpp
index b6143e41c..06f252ea6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,10 +2328,10 @@ std::vector<llama_token> llama_tokenize(
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index 5764acb6d..dbbd06da5 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
@@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
             model,
             token,
             &result,
-            Int32(result.count)
+            Int32(result.count),
+            false
         )
         assert(check == actualTokensCount)
     } else {
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 70c43a385..737f882fb 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -322,7 +322,7 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8)
+        let nTokens = llama_token_to_piece(model, token, result, 8, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +330,7 @@ actor LlamaContext {
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
diff --git a/llama.cpp b/llama.cpp
index ec4c1242b..7440c740f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1600,12 +1600,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
@@ -13312,7 +13312,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id, false);
+
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
@@ -13512,7 +13513,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token, false);
 
     // Note terminating 0 in decoded string
     const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
@@ -16991,7 +16992,7 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_WPM:
@@ -17006,7 +17007,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                    (llama_is_user_defined_token(model->vocab, token)) ||
+                    (llama_is_control_token     (model->vocab, token) && special)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 if (length < (int) result.length()) {
                     return -(int) result.length();
@@ -17019,8 +17022,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, "\xe2\x96\x85", 3);
                 return 3;
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
             } else if (llama_is_byte_token(model->vocab, token)) {
                 if (length < 1) {
                     return -1;
@@ -17041,15 +17042,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                    (llama_is_user_defined_token(model->vocab, token)) ||
+                    (llama_is_control_token     (model->vocab, token) && special)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 if (length < (int) result.length()) {
                     return -(int) result.length();
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
             }
             break;
         }
diff --git a/llama.h b/llama.h
index 5bed97ad1..4effca42c 100644
--- a/llama.h
+++ b/llama.h
@@ -828,11 +828,13 @@ extern "C" {
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
         const struct llama_model * model,
                        llama_token   token,
                              char * buf,
-                          int32_t   length);
+                          int32_t   length,
+                             bool   special);
 
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
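
Usage sketch (not part of the diff above): the snippet below illustrates how calling code might adopt the new `special` argument of `llama_token_to_piece`, reusing the grow-and-retry pattern from `common/common.cpp` in this patch. The wrapper name `token_to_piece_str` and the surrounding setup are illustrative assumptions, not code from this commit.

    // Sketch only: detokenize a single token, optionally rendering special/control
    // tokens (e.g. BOS/EOS) as their text instead of suppressing them.
    #include <string>
    #include <vector>

    #include "llama.h"

    static std::string token_to_piece_str(const struct llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8, 0);

        // a negative return value is the negated size the piece actually needs
        int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        if (n < 0) {
            buf.resize(-n);
            n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        }

        return std::string(buf.data(), n);
    }

    // token_to_piece_str(model, id, /*special =*/ true)  renders control tokens as text, e.g. for debug output
    // token_to_piece_str(model, id, /*special =*/ false) keeps the previous behavior: control tokens yield an empty piece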