mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 13:27:21 +01:00
server : --n-predict option document and cap to max value (#5549)
* server: document --n-predict
* server: ensure client request cannot override n_predict if set
* server: fix print usage LF in new --n-predict option
This commit is contained in:
parent
66c1968f7a
commit
36376abe05
@ -39,6 +39,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
|
|||||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||||
|
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
|
@ -159,6 +159,7 @@ struct llama_client_slot
|
|||||||
int32_t n_decoded = 0;
|
int32_t n_decoded = 0;
|
||||||
int32_t n_remaining = -1;
|
int32_t n_remaining = -1;
|
||||||
int32_t i_batch = -1;
|
int32_t i_batch = -1;
|
||||||
|
int32_t n_predict = -1;
|
||||||
|
|
||||||
int32_t num_prompt_tokens = 0;
|
int32_t num_prompt_tokens = 0;
|
||||||
int32_t num_prompt_tokens_processed = 0;
|
int32_t num_prompt_tokens_processed = 0;
|
||||||
@ -410,6 +411,7 @@ struct llama_server_context
|
|||||||
|
|
||||||
slot.id = i;
|
slot.id = i;
|
||||||
slot.n_ctx = n_ctx_slot;
|
slot.n_ctx = n_ctx_slot;
|
||||||
|
slot.n_predict = params.n_predict;
|
||||||
|
|
||||||
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
|
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
|
||||||
|
|
||||||
@ -546,6 +548,15 @@ struct llama_server_context
|
|||||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||||
|
|
||||||
|
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
|
||||||
|
// Might be better to reject the request with a 400 ?
|
||||||
|
LOG_WARNING("Max tokens to predict exceeds server configuration", {
|
||||||
|
{"params.n_predict", slot->params.n_predict},
|
||||||
|
{"slot.n_predict", slot->n_predict},
|
||||||
|
});
|
||||||
|
slot->params.n_predict = slot->n_predict;
|
||||||
|
}
|
||||||
|
|
||||||
// infill
|
// infill
|
||||||
if (data.count("input_prefix") != 0)
|
if (data.count("input_prefix") != 0)
|
||||||
{
|
{
|
||||||
@ -1053,6 +1064,7 @@ struct llama_server_context
|
|||||||
|
|
||||||
return json {
|
return json {
|
||||||
{"n_ctx", slot.n_ctx},
|
{"n_ctx", slot.n_ctx},
|
||||||
|
{"n_predict", slot.n_predict},
|
||||||
{"model", params.model_alias},
|
{"model", params.model_alias},
|
||||||
{"seed", slot.params.seed},
|
{"seed", slot.params.seed},
|
||||||
{"temperature", slot.sparams.temp},
|
{"temperature", slot.sparams.temp},
|
||||||
@ -1915,13 +1927,14 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
|
|||||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||||
printf(" --log-disable disables logging to a file.\n");
|
printf(" --log-disable disables logging to a file.\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||||
printf(" --chat-template FORMAT_NAME");
|
printf(" --chat-template FORMAT_NAME");
|
||||||
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user