From 45095a61bfd164e87563a0dc0fbd7b0e9891590b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 31 Dec 2024 15:22:01 +0100
Subject: [PATCH] server : clean up built-in template detection (#11026)

* server : clean up built-in template detection

* fix compilation

* add chat template test

* fix condition
---
 common/common.cpp                              | 12 ++++++++++
 common/common.h                                |  3 +++
 examples/server/server.cpp                     | 23 ++++++++-----------
 .../server/tests/unit/test_chat_completion.py | 17 ++++++++++++++
 examples/server/tests/utils.py                 |  3 +++
 examples/server/utils.hpp                      | 13 -----------
 6 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 9071999a7..fe923fce6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1614,6 +1614,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
diff --git a/common/common.h b/common/common.h
index 1d2bd932c..589f65d09 100644
--- a/common/common.h
+++ b/common/common.h
@@ -571,6 +571,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1d00954a2..b3773f276 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1724,17 +1724,10 @@ struct server_context {
         return true;
     }
 
-    bool validate_model_chat_template() const {
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res >= 0) {
-            llama_chat_message chat[] = {{"user", "test"}};
-            std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-        return false;
+    bool validate_builtin_chat_template() const {
+        llama_chat_message chat[] = {{"user", "test"}};
+        int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
     }
 
     void init() {
@@ -3583,7 +3576,7 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params_base.n_parallel },
             { "model_path",                  ctx_server.params_base.model },
-            { "chat_template",               llama_get_chat_template(ctx_server.model) },
+            { "chat_template",               common_get_builtin_chat_template(ctx_server.model) },
             { "build_info",                  build_info },
         };
 
@@ -4223,14 +4216,16 @@ int main(int argc, char ** argv) {
 
     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
     if (params.chat_template.empty()) {
-        if (!ctx_server.validate_model_chat_template()) {
+        if (!ctx_server.validate_builtin_chat_template()) {
             LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             params.chat_template = "chatml";
         }
     }
 
     // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
+        common_chat_format_example(ctx_server.model, params.chat_template).c_str());
 
     ctx_server.queue_tasks.on_new_task(std::bind(
         &server_context::process_single_task, &ctx_server, std::placeholders::_1));
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index 130da03a1..b15dba6eb 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -100,6 +100,23 @@ def test_chat_completion_with_openai_library():
     assert match_regex("(Suddenly)+", res.choices[0].message.content)
 
 
+def test_chat_template():
+    global server
+    server.chat_template = "llama3"
+    server.debug = True  # to get the "__verbose" object in the response
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "messages": [
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "__verbose" in res.body
+    assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+
 @pytest.mark.parametrize("response_format,n_predicted,re_content", [
     ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
     ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index 277125e88..359bb0fae 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -74,6 +74,7 @@ class ServerProcess:
     draft_min: int | None = None
     draft_max: int | None = None
     no_webui: bool | None = None
+    chat_template: str | None = None
 
     # session variables
     process: subprocess.Popen | None = None
@@ -164,6 +165,8 @@ class ServerProcess:
             server_args.extend(["--draft-min", self.draft_min])
         if self.no_webui:
             server_args.append("--no-webui")
+        if self.chat_template:
+            server_args.extend(["--chat-template", self.chat_template])
 
         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"bench: starting server with: {' '.join(args)}")
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 8523d4787..70220c437 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -382,19 +382,6 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
     return formatted_chat;
 }
 
-static std::string llama_get_chat_template(const struct llama_model * model) {
-    std::string template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
-    if (res < 2) {
-        return "";
-    } else {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //
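
Reviewer note (not part of the patch): below is a minimal sketch of how the pieces introduced here are meant to be used together. Only common_get_builtin_chat_template() and the nullptr-template call to llama_chat_apply_template() come from this patch; the helper names and the surrounding harness are hypothetical.

    // Sketch, assuming a loaded llama_model * model and the llama.cpp API as of this commit.
    #include <cstdio>
    #include <string>

    #include "common.h" // common_get_builtin_chat_template()
    #include "llama.h"  // llama_chat_apply_template()

    // Mirrors server_context::validate_builtin_chat_template(): passing nullptr as
    // the template makes llama_chat_apply_template() use the model's built-in
    // "tokenizer.chat_template" metadata, so validation no longer has to copy the
    // template into a guessed fixed-size buffer (the old code reserved 2048 bytes).
    static bool builtin_template_is_supported(const struct llama_model * model) {
        llama_chat_message chat[] = {{"user", "test"}};
        int32_t res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
        return res > 0;
    }

    // Hypothetical caller following the same fallback logic as server.cpp.
    static std::string choose_chat_template(const struct llama_model * model) {
        if (builtin_template_is_supported(model)) {
            // common_get_builtin_chat_template() probes the required size with a
            // NULL buffer first, then fetches the string; "" means the metadata
            // key is absent.
            return common_get_builtin_chat_template(model);
        }
        fprintf(stderr, "built-in chat template missing or unsupported, falling back to chatml\n");
        return "chatml";
    }

The probe-then-fill pattern in common_get_builtin_chat_template() is also what lets the old "longest known template is about 1200 bytes" 2048-byte guess be dropped.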