mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 14:20:31 +01:00
server: Add "tokens per second" information in the backend (#10548)
* add cmake rvv support * add timings * remove space * update readme * fix * fix code * remove empty line * add test --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
parent
991f8aabee
commit
64ed2091b2
@ -133,6 +133,7 @@ struct common_params_sampling {
|
|||||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||||
bool ignore_eos = false;
|
bool ignore_eos = false;
|
||||||
bool no_perf = false; // disable performance metrics
|
bool no_perf = false; // disable performance metrics
|
||||||
|
bool timing_per_token = false;
|
||||||
|
|
||||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||||
|
|
||||||
|
@ -416,6 +416,8 @@ node index.js
|
|||||||
|
|
||||||
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
|
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
|
||||||
|
|
||||||
|
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
|
||||||
|
|
||||||
**Response format**
|
**Response format**
|
||||||
|
|
||||||
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
|
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
|
||||||
|
@ -177,6 +177,8 @@ struct server_slot {
|
|||||||
bool stopped_word = false;
|
bool stopped_word = false;
|
||||||
bool stopped_limit = false;
|
bool stopped_limit = false;
|
||||||
|
|
||||||
|
bool timings_per_token = false;
|
||||||
|
|
||||||
bool oaicompat = false;
|
bool oaicompat = false;
|
||||||
|
|
||||||
std::string oaicompat_model;
|
std::string oaicompat_model;
|
||||||
@ -882,6 +884,8 @@ struct server_context {
|
|||||||
slot.oaicompat_model = "";
|
slot.oaicompat_model = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slot.timings_per_token = json_value(data, "timings_per_token", false);
|
||||||
|
|
||||||
slot.params.stream = json_value(data, "stream", false);
|
slot.params.stream = json_value(data, "stream", false);
|
||||||
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
|
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||||
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
|
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
|
||||||
@ -1279,6 +1283,7 @@ struct server_context {
|
|||||||
{"speculative.n_max", slot.params.speculative.n_max},
|
{"speculative.n_max", slot.params.speculative.n_max},
|
||||||
{"speculative.n_min", slot.params.speculative.n_min},
|
{"speculative.n_min", slot.params.speculative.n_min},
|
||||||
{"speculative.p_min", slot.params.speculative.p_min},
|
{"speculative.p_min", slot.params.speculative.p_min},
|
||||||
|
{"timings_per_token", slot.timings_per_token},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1336,6 +1341,10 @@ struct server_context {
|
|||||||
res.data["model"] = slot.oaicompat_model;
|
res.data["model"] = slot.oaicompat_model;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (slot.timings_per_token) {
|
||||||
|
res.data["timings"] = slot.get_formated_timings();
|
||||||
|
}
|
||||||
|
|
||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2274,12 +2283,17 @@ struct server_context {
|
|||||||
common_sampler_accept(slot.smpl, id, true);
|
common_sampler_accept(slot.smpl, id, true);
|
||||||
|
|
||||||
slot.n_decoded += 1;
|
slot.n_decoded += 1;
|
||||||
|
|
||||||
|
const int64_t t_current = ggml_time_us();
|
||||||
|
|
||||||
if (slot.n_decoded == 1) {
|
if (slot.n_decoded == 1) {
|
||||||
slot.t_start_generation = ggml_time_us();
|
slot.t_start_generation = t_current;
|
||||||
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
|
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
|
||||||
metrics.on_prompt_eval(slot);
|
metrics.on_prompt_eval(slot);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
|
||||||
|
|
||||||
completion_token_output result;
|
completion_token_output result;
|
||||||
result.tok = id;
|
result.tok = id;
|
||||||
|
|
||||||
|
@ -146,3 +146,20 @@ def test_invalid_chat_completion_req(messages):
|
|||||||
})
|
})
|
||||||
assert res.status_code == 400 or res.status_code == 500
|
assert res.status_code == 400 or res.status_code == 500
|
||||||
assert "error" in res.body
|
assert "error" in res.body
|
||||||
|
|
||||||
|
|
||||||
|
def test_chat_completion_with_timings_per_token():
|
||||||
|
global server
|
||||||
|
server.start()
|
||||||
|
res = server.make_stream_request("POST", "/chat/completions", data={
|
||||||
|
"max_tokens": 10,
|
||||||
|
"messages": [{"role": "user", "content": "test"}],
|
||||||
|
"stream": True,
|
||||||
|
"timings_per_token": True,
|
||||||
|
})
|
||||||
|
for data in res:
|
||||||
|
assert "timings" in data
|
||||||
|
assert "prompt_per_second" in data["timings"]
|
||||||
|
assert "predicted_per_second" in data["timings"]
|
||||||
|
assert "predicted_n" in data["timings"]
|
||||||
|
assert data["timings"]["predicted_n"] <= 10
|
||||||
|
@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r
|
|||||||
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (result.contains("timings")) {
|
||||||
|
res.push_back({"timings", json_value(result, "timings", json::object())});
|
||||||
|
}
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -740,6 +744,11 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
|
|||||||
{"model", modelname},
|
{"model", modelname},
|
||||||
{"object", "chat.completion.chunk"}
|
{"object", "chat.completion.chunk"}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (result.contains("timings")) {
|
||||||
|
ret.push_back({"timings", json_value(result, "timings", json::object())});
|
||||||
|
}
|
||||||
|
|
||||||
if (!finish_reason.empty()) {
|
if (!finish_reason.empty()) {
|
||||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||||
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
||||||
|
Loading…
Reference in New Issue
Block a user