From 36376abe05a12a8cb3af548a4af9b8d0e2e69597 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert
Date: Sun, 18 Feb 2024 17:30:09 +0100
Subject: [PATCH] server : --n-predict option document and cap to max value
 (#5549)

* server: document --n-predict

* server: ensure client request cannot override n_predict if set

* server: fix print usage LF in new --n-predict option

---
 examples/server/README.md  |  1 +
 examples/server/server.cpp | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 249368749..fe5cd8d5d 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -39,6 +39,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 
 ## Build
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7800c6e7e..7aa706e95 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -159,6 +159,7 @@ struct llama_client_slot
     int32_t n_decoded   = 0;
     int32_t n_remaining = -1;
     int32_t i_batch     = -1;
+    int32_t n_predict   = -1;
 
     int32_t num_prompt_tokens           = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -410,6 +411,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
 
@@ -546,6 +548,15 @@ struct llama_server_context
         slot->sparams.grammar   = json_value(data, "grammar",   default_sparams.grammar);
         slot->sparams.n_probs   = json_value(data, "n_probs",   default_sparams.n_probs);
 
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
+
         // infill
         if (data.count("input_prefix") != 0)
         {
@@ -1053,6 +1064,7 @@ struct llama_server_context
         return json {
             {"n_ctx",       slot.n_ctx},
+            {"n_predict",   slot.n_predict},
             {"model",       params.model_alias},
             {"seed",        slot.params.seed},
             {"temperature", slot.sparams.temp},
@@ -1915,13 +1927,14 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable         disables logging to a file.\n");
     printf("\n");
+    printf("  -n, --n-predict       maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
     printf("  -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
     printf("  --chat-template FORMAT_NAME");
-    printf("                        set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf("                        set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
     printf("\n");
 }
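
Note (not part of the patch): a minimal sketch of how the new cap behaves from the
client side, assuming the server's existing /completion endpoint on the default
port 8080; the model path and token counts are only illustrative.

    # start the server with a hard per-request limit of 128 predicted tokens
    ./server -m models/7B/ggml-model.gguf -c 2048 -n 128

    # a request asking for more than the server-side limit is capped to 128;
    # the server logs "Max tokens to predict exceeds server configuration"
    curl --request POST --url http://localhost:8080/completion \
         --header "Content-Type: application/json" \
         --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 512}'

Since the patch also adds "n_predict" to the json built by get_formated_generation(),
the effective (capped) limit is visible to the client in the generation settings
returned with the response.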