Mirror of https://github.com/ggerganov/llama.cpp.git
server : OAI API compatibility (#4198)

* Add openai-compatible POST /v1/chat/completions API endpoint to server example
* fix code style
* Update server README.md
* Improve server README.md
* Fix server.cpp code style according to review
* server : some style changes
* server : indentation
* server : enable special tokens during tokenization by default
* server : minor code style
* server : change random string generator
* straightforward /v1/models endpoint

---------

Co-authored-by: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com>
Co-authored-by: Tobi Lütke <tobi@Tobis-MacBook-Pro.local>
This commit is contained in:
parent e9c13ff781
commit af19d35734
README.md:

@@ -234,6 +234,55 @@ node index.js

 - **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.

+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc., can be used with this endpoint. Compared to `api_like_OAI.py`, this API implementation does not require a wrapper to be served.
+
+    *Options:*
+
+    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+
+    *Examples:*
+
+    You can use either the Python `openai` library with appropriate checkpoints:
+
+    ```python
+    import openai
+
+    client = openai.OpenAI(
+        base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+        api_key = "sk-no-key-required"
+    )
+
+    completion = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
+            {"role": "user", "content": "Write a limerick about python exceptions"}
+        ]
+    )
+
+    print(completion.choices[0].message)
+    ```
+
+    ... or raw HTTP requests:
+
+    ```shell
+    curl http://localhost:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+        "model": "gpt-3.5-turbo",
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
+            },
+            {
+                "role": "user",
+                "content": "Write a limerick about python exceptions"
+            }
+        ]
+    }'
+    ```
+
 ## More examples

 ### Change system prompt on runtime
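Because `oaicompat_completion_params_parse` (added to `server.cpp` below) copies llama.cpp-specific fields such as `mirostat`, `mirostat_tau` and `repeat_last_n` straight out of the request body, a client can mix them with the standard OpenAI fields in the same request. A minimal sketch using the Python `requests` package (not part of this commit; the option values are arbitrary examples):

```python
# Illustrative only: combine OpenAI-style fields with llama.cpp sampling options
# in a single request to the OAI-compatible endpoint added by this commit.
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",   # assumes the default host/port
    headers={"Authorization": "Bearer no-key"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Write a limerick about python exceptions"}
        ],
        # llama.cpp `/completion`-specific options, forwarded by
        # oaicompat_completion_params_parse (values chosen only for illustration)
        "mirostat": 2,
        "mirostat_tau": 5.0,
        "mirostat_eta": 0.1,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```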
server.cpp:

@@ -29,6 +29,8 @@
 #define SERVER_VERBOSE 1
 #endif

+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+
 using json = nlohmann::json;

 struct server_params
@@ -59,6 +61,10 @@ static bool server_verbose = false;
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

+json oaicompat_completion_params_parse(const json &body);
+std::string format_chatml(std::vector<json> messages);
+
+
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -378,6 +384,9 @@ struct llama_client_slot
     bool stopped_word  = false;
     bool stopped_limit = false;

+    bool oaicompat = false;
+    std::string oaicompat_model;
+
     std::string stopping_word;

     // sampling
@@ -477,7 +486,7 @@ struct llama_client_slot
         };
     }

-    void print_timings() {
+    void print_timings() const {
         LOG_TEE("\n");
         LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
@@ -609,6 +618,11 @@ struct llama_server_context

     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
     {
+        // TODO: currently, we tokenize using special tokens by default
+        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
+        //       but it's better compared to completely ignoring ChatML and other chat templates
+        const bool TMP_FORCE_SPECIAL = true;
+
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -624,12 +638,12 @@ struct llama_server_context
                 std::vector<llama_token> p;
                 if (first)
                 {
-                    p = ::llama_tokenize(ctx, s, add_bos);
+                    p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                     first = false;
                 }
                 else
                 {
-                    p = ::llama_tokenize(ctx, s, false);
+                    p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                 }
                 prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
             }
@@ -646,7 +660,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }

         return prompt_tokens;
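The practical effect of `TMP_FORCE_SPECIAL` is that chat-template markers such as `<|im_start|>` are parsed as the model's dedicated special tokens (when the vocabulary defines them) instead of being split into ordinary text pieces. A rough way to observe this from the outside, sketched against the server's pre-existing `/tokenize` endpoint; the exact ids are model-dependent and this snippet is not part of the commit:

```python
# Sketch: inspect how the server tokenizes a ChatML marker after this change.
# Assumes the existing POST /tokenize endpoint ({"content": ...} -> {"tokens": [...]}).
import requests

marker = "<|im_start|>user\n"
resp = requests.post("http://localhost:8080/tokenize", json={"content": marker})
tokens = resp.json().get("tokens", [])

# With special-token parsing enabled (this commit), a ChatML-tuned model should map
# <|im_start|> to a single dedicated token id; without it, the marker is split into
# several ordinary text tokens.
print(len(tokens), tokens)
```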
@@ -677,6 +691,14 @@ struct llama_server_context
         slot_params default_params;
         llama_sampling_params default_sparams;

+        if (data.count("__oaicompat") != 0) {
+            slot->oaicompat = true;
+            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+        } else {
+            slot->oaicompat = false;
+            slot->oaicompat_model = "";
+        }
+
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -1170,6 +1192,12 @@ struct llama_server_context
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
         }

+        if (slot.oaicompat)
+        {
+            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
+            res.result_json["model"] = slot.oaicompat_model;
+        }
+
         queue_results.push_back(res);
     }

@@ -1217,6 +1245,12 @@ struct llama_server_context
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
         }

+        if (slot.oaicompat)
+        {
+            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
+            res.result_json["model"] = slot.oaicompat_model;
+        }
+
         queue_results.push_back(res);
     }

@@ -1257,7 +1291,7 @@ struct llama_server_context
         task_server task;
         task.id = id_gen++;
         task.target_id = 0;
-        task.data = data;
+        task.data = std::move(data);
         task.infill_mode = infill;
         task.embedding_mode = embedding;
         task.type = COMPLETION_TASK;
@@ -2180,6 +2214,233 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
     }
 }

+
+
+static std::string random_string()
+{
+    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::string result(32, ' ');
+
+    for (int i = 0; i < 32; ++i) {
+        result[i] = str[generator() % str.size()];
+    }
+
+    return result;
+}
+
+static std::string gen_chatcmplid()
+{
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+    return chatcmplid.str();
+}
+
+std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;
+
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role", std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
+    }
+
+    chatml_msgs << "<|im_start|>assistant" << '\n';
+
+    return chatml_msgs.str();
+}
+
+/* llama.cpp completion api semantics */
+json oaicompat_completion_params_parse(
+    const json &body /* openai api json semantics */)
+{
+    json llama_params;
+
+    llama_params["__oaicompat"] = true;
+
+    // Map OpenAI parameters to llama.cpp parameters
+    llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["temperature"] = json_value(body, "temperature", 0.8);
+    llama_params["top_k"] = json_value(body, "top_k", 40);
+    llama_params["top_p"] = json_value(body, "top_p", 0.95);
+    llama_params["n_predict"] = json_value(body, "max_tokens", -1);
+    llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
+    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
+    llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
+    llama_params["seed"] = json_value(body, "seed", 0);
+    llama_params["stream"] = json_value(body, "stream", false);
+    llama_params["mirostat"] = json_value(body, "mirostat", false);
+    llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
+    llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
+    llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
+    llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
+    llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
+    llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
+    llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
+
+    if (llama_params.count("grammar") != 0) {
+        llama_params["grammar"] = json_value(body, "grammar", json::object());
+    }
+
+    // Handle 'stop' field
+    if (body["stop"].is_null()) {
+        llama_params["stop"] = json::array({});
+    } else if (body["stop"].is_string()) {
+        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }
+
+    // Ensure there is ChatML-specific end sequence among stop words
+    llama_params["stop"].push_back("<|im_end|>");
+
+    return llama_params;
+}
+
+static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
+{
+    json result = response.result_json;
+
+    bool stopped_word = result.count("stopped_word") != 0;
+    bool stopped_eos = json_value(result, "stopped_eos", false);
+    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+    std::string content = json_value(result, "content", std::string(""));
+
+    std::string finish_reason = "length";
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+
+    json choices =
+        streaming ? json::array({json{{"finish_reason", finish_reason},
+                                      {"index", 0},
+                                      {"delta", json::object()}}})
+                  : json::array({json{{"finish_reason", finish_reason},
+                                      {"index", 0},
+                                      {"message", json{{"content", content},
+                                                       {"role", "assistant"}}}}});
+
+    std::time_t t = std::time(0);
+
+    json res =
+        json{{"choices", choices},
+             {"created", t},
+             {"model",
+              json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+             {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
+             {"usage",
+              json{{"completion_tokens", num_tokens_predicted},
+                   {"prompt_tokens", num_prompt_tokens},
+                   {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
+             {"id", gen_chatcmplid()}};
+
+    if (server_verbose) {
+        res["__verbose"] = result;
+    }
+
+    if (result.contains("completion_probabilities")) {
+        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
+    }
+
+    return res;
+}
+
+// return value is vector as there is one case where we might need to generate two responses
+static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
+    json result = response.result_json;
+
+    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
+        return std::vector<json>({response.result_json});
+    }
+
+    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
+    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+
+    bool stopped_word = json_value(result, "stopped_word", false);
+    bool stopped_eos = json_value(result, "stopped_eos", false);
+    bool stopped_limit = json_value(result, "stopped_limit", false);
+    std::string content = json_value(result, "content", std::string(""));
+
+    std::string finish_reason;
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+    if (stopped_limit) {
+        finish_reason = "length";
+    }
+
+    std::time_t t = std::time(0);
+
+    json choices;
+
+    if (!finish_reason.empty()) {
+        choices = json::array({json{{"finish_reason", finish_reason},
+                                    {"index", 0},
+                                    {"delta", json::object()}}});
+    } else {
+        if (first) {
+            if (content.empty()) {
+                choices = json::array({json{{"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{{"role", "assistant"}}}}});
+            } else {
+                // We have to send this as two updates to conform to openai behavior
+                json initial_ret = json{{"choices", json::array({json{
+                                            {"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{
+                                                {"role", "assistant"}
+                                            }}}})},
+                                        {"created", t},
+                                        {"id", gen_chatcmplid()},
+                                        {"model", modelname},
+                                        {"object", "chat.completion.chunk"}};
+
+                json second_ret = json{
+                                        {"choices", json::array({json{{"finish_reason", nullptr},
+                                                                      {"index", 0},
+                                                                      {"delta", json{
+                                                                          {"content", content}}}
+                                                                      }})},
+                                        {"created", t},
+                                        {"id", gen_chatcmplid()},
+                                        {"model", modelname},
+                                        {"object", "chat.completion.chunk"}};
+
+                return std::vector<json>({initial_ret, second_ret});
+            }
+        } else {
+            // Some idiosyncrasy in task processing logic makes several trailing calls
+            // with empty content; we ignore these at the callee site.
+            if (content.empty()) {
+                return std::vector<json>({json::object()});
+            }
+
+            choices = json::array({json{
+                {"finish_reason", nullptr},
+                {"index", 0},
+                {"delta",
+                 json{
+                     {"content", content},
+                 }},
+            }});
+        }
+    }
+
+    json ret = json{{"choices", choices},
+                    {"created", t},
+                    {"id", gen_chatcmplid()},
+                    {"model", modelname},
+                    {"object", "chat.completion.chunk"}};
+
+    return std::vector<json>({ret});
+}
+
 static json format_partial_response(
     llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
 ) {
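For reference, here is a small Python sketch (not part of the commit) that mirrors the `format_chatml` helper above and shows the prompt string the server builds from the README example `messages` before passing it to the regular completion path:

```python
# Python mirror of the C++ format_chatml() above, for illustration only.
def format_chatml(messages):
    out = []
    for m in messages:
        out.append("<|im_start|>" + m.get("role", "user") + "\n")
        out.append(m.get("content", "") + "<|im_end|>\n")
    out.append("<|im_start|>assistant\n")
    return "".join(out)

messages = [
    {"role": "system", "content": "You are ChatGPT, an AI assistant."},
    {"role": "user", "content": "Write a limerick about python exceptions"},
]
print(format_chatml(messages))
# <|im_start|>system
# You are ChatGPT, an AI assistant.<|im_end|>
# <|im_start|>user
# Write a limerick about python exceptions<|im_end|>
# <|im_start|>assistant
```

Note that `oaicompat_completion_params_parse` also appends `<|im_end|>` to the stop sequences, so generation halts at the end of the assistant turn even if the client supplies no stop words of its own.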
@@ -2398,6 +2659,98 @@ int main(int argc, char **argv)
             }
         });

+
+
+    svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
+            {
+                std::time_t t = std::time(0);
+
+                json models = {
+                    {"object", "list"},
+                    {"data", {
+                        {
+                            {"id", params.model_alias},
+                            {"object", "model"},
+                            {"created", t},
+                            {"owned_by", "llamacpp"}
+                        },
+                    }}
+                };
+
+                res.set_content(models.dump(), "application/json");
+            });
+
+    // TODO: add mount point without "/v1" prefix -- how?
+    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
+            {
+                json data = oaicompat_completion_params_parse(json::parse(req.body));
+
+                const int task_id = llama.request_completion(data, false, false);
+
+                if (!json_value(data, "stream", false)) {
+                    std::string completion_text;
+                    task_result result = llama.next_result(task_id);
+
+                    if (!result.error && result.stop) {
+                        json oaicompat_result = format_final_response_oaicompat(data, result);
+
+                        res.set_content(oaicompat_result.dump(-1, ' ', false,
+                                            json::error_handler_t::replace),
+                                            "application/json");
+                    } else {
+                        res.status = 500;
+                        res.set_content(result.result_json["content"], "text/plain");
+                        return;
+                    }
+                } else {
+                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
+                        while (true) {
+                            task_result llama_result = llama.next_result(task_id);
+                            if (!llama_result.error) {
+                                std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
+
+                                for (auto it = result_array.begin(); it != result_array.end(); ++it)
+                                {
+                                    if (!it->empty()) {
+                                        const std::string str =
+                                            "data: " +
+                                            it->dump(-1, ' ', false, json::error_handler_t::replace) +
+                                            "\n\n";
+                                        LOG_VERBOSE("data stream", {{"to_send", str}});
+                                        if (!sink.write(str.c_str(), str.size())) {
+                                            return false;
+                                        }
+                                    }
+                                }
+                                if (llama_result.stop) {
+                                    break;
+                                }
+                            } else {
+                                const std::string str =
+                                    "error: " +
+                                    llama_result.result_json.dump(-1, ' ', false,
+                                                                  json::error_handler_t::replace) +
+                                    "\n\n";
+                                LOG_VERBOSE("data stream", {{"to_send", str}});
+                                if (!sink.write(str.c_str(), str.size())) {
+                                    return false;
+                                }
+                                break;
+                            }
+                        }
+                        sink.done();
+                        return true;
+                    };
+
+                    auto on_complete = [task_id, &llama](bool) {
+                        // cancel request
+                        llama.request_cancel(task_id);
+                    };
+
+                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+                }
+            });
+
     svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 json data = json::parse(req.body);
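Since the streaming branch above emits Server-Sent-Events-style `data: <json>\n\n` chunks with content type `text/event-stream`, the stream can be consumed without the `openai` package. A minimal sketch with `requests` (not part of the commit; error events are not handled):

```python
# Sketch: consume the streaming /v1/chat/completions output added by this commit.
# Each event line looks like "data: {chat.completion.chunk JSON}"; text arrives in deltas.
import json
import requests

with requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separators between events
        chunk = json.loads(line[len("data: "):])
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            print(delta["content"], end="", flush=True)
print()
```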