server : add system_fingerprint to chat/completion (#10917)

* server : add system_fingerprint to chat/completion
* update README
2024-12-27 06:39:25 +01:00 · 2024-12-23 12:02:44 +01:00 · 2024-12-23 12:02:44 +01:00 · 485dc01214
commit 485dc01214
parent 86bf31cfe6
4 changed files with 25 additions and 15 deletions
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -724,7 +724,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
  },
  "total_slots": 1,
  "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-  "chat_template": "..."
+  "chat_template": "...",
+  "build_info": "b(build number)-(build commit hash)"
 }
 ```

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -598,6 +598,7 @@ struct server_task_result_cmpl_final : server_task_result {
            {"choices",            json::array({choice})},
            {"created",            t},
            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
            {"object",             "chat.completion"},
            {"usage", json {
                {"completion_tokens", n_decoded},
@@ -636,6 +637,7 @@ struct server_task_result_cmpl_final : server_task_result {
            {"created",            t},
            {"id",                 oaicompat_cmpl_id},
            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
            {"object",             "chat.completion.chunk"},
            {"usage", json {
                {"completion_tokens", n_decoded},
@@ -765,6 +767,7 @@ struct server_task_result_cmpl_partial : server_task_result {
            {"created",            t},
            {"id",                 oaicompat_cmpl_id},
            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
            {"object",             "chat.completion.chunk"}
        };

@@ -3476,6 +3479,7 @@ int main(int argc, char ** argv) {
            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "model_path",                  ctx_server.params_base.model },
            { "chat_template",               llama_get_chat_template(ctx_server.model) },
+            { "build_info",                  build_info },
        };

        res_ok(res, data);
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
    })
    assert res.status_code == 200
    assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
+    assert res.body["system_fingerprint"].startswith("b")
    assert res.body["model"] == model if model is not None else server.model_alias
    assert res.body["usage"]["prompt_tokens"] == n_prompt
    assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
    last_cmpl_id = None
    for data in res:
        choice = data["choices"][0]
+        assert data["system_fingerprint"].startswith("b")
        assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
        if last_cmpl_id is None:
            last_cmpl_id = data["id"]
@@ -92,6 +94,7 @@ def test_chat_completion_with_openai_library():
        seed=42,
        temperature=0.8,
    )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
    assert res.choices[0].finish_reason == "length"
    assert res.choices[0].message.content is not None
    assert match_regex("(Suddenly)+", res.choices[0].message.content)
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -56,6 +56,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
    }
 }

+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
 //
 // tokenizer and input processing utils
 //