diff --git a/examples/server/README.md b/examples/server/README.md index 5e3ae833b..ac5133d24 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -40,6 +40,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` - `-n, --n-predict`: Set the maximum tokens to predict (default: -1) +- `--slots-endpoint-disable`: Disables the slots state monitoring endpoint. The slots state may contain user data, including prompts. ## Build @@ -381,6 +382,69 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` +- **GET** `/slots`: Returns the current processing state of all slots. Can be disabled with `--slots-endpoint-disable`. + +### Result JSON + +```json +[ + { + "dynatemp_exponent": 1.0, + "dynatemp_range": 0.0, + "frequency_penalty": 0.0, + "grammar": "", + "id": 0, + "ignore_eos": false, + "logit_bias": [], + "min_p": 0.05000000074505806, + "mirostat": 0, + "mirostat_eta": 0.10000000149011612, + "mirostat_tau": 5.0, + "model": "llama-2-7b-32k-instruct.Q2_K.gguf", + "n_ctx": 2048, + "n_keep": 0, + "n_predict": 100000, + "n_probs": 0, + "next_token": { + "has_next_token": true, + "n_remain": -1, + "num_tokens_predicted": 0, + "stopped_eos": false, + "stopped_limit": false, + "stopped_word": false, + "stopping_word": "" + }, + "penalize_nl": true, + "penalty_prompt_tokens": [], + "presence_penalty": 0.0, + "prompt": "Say hello to llama.cpp", + "repeat_last_n": 64, + "repeat_penalty": 1.100000023841858, + "samplers": [ + "top_k", + "tfs_z", + "typical_p", + "top_p", + "min_p", + "temperature" + ], + "seed": 42, + "state": 1, + "stop": [ + "\n" + ], + "stream": false, + "task_id": 0, + "temperature": 0.0, + "tfs_z": 1.0, + 
"top_k": 40, + "top_p": 0.949999988079071, + "typical_p": 1.0, + "use_penalty_prompt_tokens": false + } +] +``` + ## More examples ### Change system prompt on runtime diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8145af867..4f2e9c898 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -41,6 +41,7 @@ struct server_params int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; + bool slots_endpoint = true; }; bool server_verbose = false; @@ -1926,6 +1927,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params, printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --log-disable disables logging to a file.\n"); + printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); printf("\n"); printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); @@ -2374,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, log_set_target(stdout); LOG_INFO("logging to file is disabled.", {}); } + else if (arg == "--slots-endpoint-disable") + { + sparams.slots_endpoint = false; + } else if (arg == "--chat-template") { if (++i >= argc) @@ -2619,6 +2625,32 @@ int main(int argc, char **argv) } }); + if (sparams.slots_endpoint) { + svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) { + json slots; + for (llama_client_slot & slot : llama.slots) { + json slot_data = llama.get_formated_generation(slot); + slot_data["id"] = slot.id; + slot_data["task_id"] = slot.task_id; + slot_data["state"] = slot.state; + slot_data["prompt"] = slot.prompt; + slot_data["next_token"] = { + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", 
slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }; + + slots.push_back(slot_data); + } + res.set_content(slots.dump(), "application/json"); + res.status = 200; // HTTP OK + }); + } + svr.set_logger(log_server_request); svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)