mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
server: metrics: add llamacpp:prompt_seconds_total and llamacpp:tokens_predicted_seconds_total, reset bucket only on /metrics. Fix values cast to int. Add Process-Start-Time-Unix header. (#5937)
Closes #5850
This commit is contained in:
parent
e457fb3540
commit
76e868821a
@ -335,8 +335,12 @@ struct server_slot {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct server_metrics {
|
struct server_metrics {
|
||||||
|
const int64_t t_start = ggml_time_us();
|
||||||
|
|
||||||
uint64_t n_prompt_tokens_processed_total = 0;
|
uint64_t n_prompt_tokens_processed_total = 0;
|
||||||
|
uint64_t t_prompt_processing_total = 0;
|
||||||
uint64_t n_tokens_predicted_total = 0;
|
uint64_t n_tokens_predicted_total = 0;
|
||||||
|
uint64_t t_tokens_generation_total = 0;
|
||||||
|
|
||||||
uint64_t n_prompt_tokens_processed = 0;
|
uint64_t n_prompt_tokens_processed = 0;
|
||||||
uint64_t t_prompt_processing = 0;
|
uint64_t t_prompt_processing = 0;
|
||||||
@ -348,12 +352,14 @@ struct server_metrics {
|
|||||||
n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
|
n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
|
||||||
n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
|
n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
|
||||||
t_prompt_processing += slot.t_prompt_processing;
|
t_prompt_processing += slot.t_prompt_processing;
|
||||||
|
t_prompt_processing_total += slot.t_prompt_processing;
|
||||||
}
|
}
|
||||||
|
|
||||||
void on_prediction(const server_slot &slot) {
|
void on_prediction(const server_slot &slot) {
|
||||||
n_tokens_predicted_total += slot.n_decoded;
|
n_tokens_predicted_total += slot.n_decoded;
|
||||||
n_tokens_predicted += slot.n_decoded;
|
n_tokens_predicted += slot.n_decoded;
|
||||||
t_tokens_generation += slot.t_token_generation;
|
t_tokens_generation += slot.t_token_generation;
|
||||||
|
t_tokens_generation_total += slot.t_token_generation;
|
||||||
}
|
}
|
||||||
|
|
||||||
void reset_bucket() {
|
void reset_bucket() {
|
||||||
@ -1502,9 +1508,12 @@ struct server_context {
|
|||||||
{ "idle", n_idle_slots },
|
{ "idle", n_idle_slots },
|
||||||
{ "processing", n_processing_slots },
|
{ "processing", n_processing_slots },
|
||||||
{ "deferred", queue_tasks.queue_tasks_deferred.size() },
|
{ "deferred", queue_tasks.queue_tasks_deferred.size() },
|
||||||
|
{ "t_start", metrics.t_start},
|
||||||
|
|
||||||
{ "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
|
{ "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
|
||||||
|
{ "t_tokens_generation_total", metrics.t_tokens_generation_total},
|
||||||
{ "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
|
{ "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
|
||||||
|
{ "t_prompt_processing_total", metrics.t_prompt_processing_total},
|
||||||
|
|
||||||
{ "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
|
{ "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
|
||||||
{ "t_prompt_processing", metrics.t_prompt_processing},
|
{ "t_prompt_processing", metrics.t_prompt_processing},
|
||||||
@ -1517,7 +1526,9 @@ struct server_context {
|
|||||||
{ "slots", slots_data },
|
{ "slots", slots_data },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (json_value(task.data, "reset_bucket", false)) {
|
||||||
metrics.reset_bucket();
|
metrics.reset_bucket();
|
||||||
|
}
|
||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
@ -2709,6 +2720,7 @@ int main(int argc, char ** argv) {
|
|||||||
task.id_multi = -1;
|
task.id_multi = -1;
|
||||||
task.id_target = -1;
|
task.id_target = -1;
|
||||||
task.type = SERVER_TASK_TYPE_METRICS;
|
task.type = SERVER_TASK_TYPE_METRICS;
|
||||||
|
task.data.push_back({{"reset_bucket", true}});
|
||||||
|
|
||||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||||
ctx_server.queue_tasks.post(task);
|
ctx_server.queue_tasks.post(task);
|
||||||
@ -2732,20 +2744,28 @@ int main(int argc, char ** argv) {
|
|||||||
{"counter", {{
|
{"counter", {{
|
||||||
{"name", "prompt_tokens_total"},
|
{"name", "prompt_tokens_total"},
|
||||||
{"help", "Number of prompt tokens processed."},
|
{"help", "Number of prompt tokens processed."},
|
||||||
{"value", data["n_prompt_tokens_processed_total"]}
|
{"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
|
||||||
|
}, {
|
||||||
|
{"name", "prompt_seconds_total"},
|
||||||
|
{"help", "Prompt process time"},
|
||||||
|
{"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
|
||||||
}, {
|
}, {
|
||||||
{"name", "tokens_predicted_total"},
|
{"name", "tokens_predicted_total"},
|
||||||
{"help", "Number of generation tokens processed."},
|
{"help", "Number of generation tokens processed."},
|
||||||
{"value", data["n_tokens_predicted_total"]}
|
{"value", (uint64_t) data["n_tokens_predicted_total"]}
|
||||||
|
}, {
|
||||||
|
{"name", "tokens_predicted_seconds_total"},
|
||||||
|
{"help", "Predict process time"},
|
||||||
|
{"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
|
||||||
}}},
|
}}},
|
||||||
{"gauge", {{
|
{"gauge", {{
|
||||||
{"name", "prompt_tokens_seconds"},
|
{"name", "prompt_tokens_seconds"},
|
||||||
{"help", "Average prompt throughput in tokens/s."},
|
{"help", "Average prompt throughput in tokens/s."},
|
||||||
{"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
|
{"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.}
|
||||||
},{
|
},{
|
||||||
{"name", "predicted_tokens_seconds"},
|
{"name", "predicted_tokens_seconds"},
|
||||||
{"help", "Average generation throughput in tokens/s."},
|
{"help", "Average generation throughput in tokens/s."},
|
||||||
{"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
|
{"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.}
|
||||||
},{
|
},{
|
||||||
{"name", "kv_cache_usage_ratio"},
|
{"name", "kv_cache_usage_ratio"},
|
||||||
{"help", "KV-cache usage. 1 means 100 percent usage."},
|
{"help", "KV-cache usage. 1 means 100 percent usage."},
|
||||||
@ -2753,15 +2773,15 @@ int main(int argc, char ** argv) {
|
|||||||
},{
|
},{
|
||||||
{"name", "kv_cache_tokens"},
|
{"name", "kv_cache_tokens"},
|
||||||
{"help", "KV-cache tokens."},
|
{"help", "KV-cache tokens."},
|
||||||
{"value", data["kv_cache_tokens_count"]}
|
{"value", (uint64_t) data["kv_cache_tokens_count"]}
|
||||||
},{
|
},{
|
||||||
{"name", "requests_processing"},
|
{"name", "requests_processing"},
|
||||||
{"help", "Number of request processing."},
|
{"help", "Number of request processing."},
|
||||||
{"value", data["processing"]}
|
{"value", (uint64_t) data["processing"]}
|
||||||
},{
|
},{
|
||||||
{"name", "requests_deferred"},
|
{"name", "requests_deferred"},
|
||||||
{"help", "Number of request deferred."},
|
{"help", "Number of request deferred."},
|
||||||
{"value", data["deferred"]}
|
{"value", (uint64_t) data["deferred"]}
|
||||||
}}}
|
}}}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -2775,13 +2795,16 @@ int main(int argc, char ** argv) {
|
|||||||
const std::string name = metric_def["name"];
|
const std::string name = metric_def["name"];
|
||||||
const std::string help = metric_def["help"];
|
const std::string help = metric_def["help"];
|
||||||
|
|
||||||
auto value = json_value(metric_def, "value", 0);
|
auto value = json_value(metric_def, "value", 0.);
|
||||||
prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
|
prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
|
||||||
<< "# TYPE llamacpp:" << name << " " << type << "\n"
|
<< "# TYPE llamacpp:" << name << " " << type << "\n"
|
||||||
<< "llamacpp:" << name << " " << value << "\n";
|
<< "llamacpp:" << name << " " << value << "\n";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const int64_t t_start = data["t_start"];
|
||||||
|
res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
|
||||||
|
|
||||||
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
|
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
|
||||||
res.status = 200; // HTTP OK
|
res.status = 200; // HTTP OK
|
||||||
});
|
});
|
||||||
|
@ -29,6 +29,7 @@ Feature: llama.cpp server
|
|||||||
And a completion request with no api error
|
And a completion request with no api error
|
||||||
Then <n_predicted> tokens are predicted matching <re_content>
|
Then <n_predicted> tokens are predicted matching <re_content>
|
||||||
And prometheus metrics are exposed
|
And prometheus metrics are exposed
|
||||||
|
And metric llamacpp:tokens_predicted is <n_predicted>
|
||||||
|
|
||||||
Examples: Prompts
|
Examples: Prompts
|
||||||
| prompt | n_predict | re_content | n_predicted |
|
| prompt | n_predict | re_content | n_predicted |
|
||||||
|
@ -586,14 +586,24 @@ async def step_prometheus_metrics_exported(context):
|
|||||||
metric_exported = False
|
metric_exported = False
|
||||||
if context.debug:
|
if context.debug:
|
||||||
print(f"/metrics answer:\n{metrics_raw}\n")
|
print(f"/metrics answer:\n{metrics_raw}\n")
|
||||||
|
context.metrics = {}
|
||||||
for metric in parser.text_string_to_metric_families(metrics_raw):
|
for metric in parser.text_string_to_metric_families(metrics_raw):
|
||||||
match metric.name:
|
match metric.name:
|
||||||
case "llamacpp:kv_cache_usage_ratio":
|
case "llamacpp:kv_cache_usage_ratio":
|
||||||
assert len(metric.samples) > 0
|
assert len(metric.samples) > 0
|
||||||
metric_exported = True
|
metric_exported = True
|
||||||
|
context.metrics[metric.name] = metric
|
||||||
|
assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
|
||||||
assert metric_exported, "No metrics exported"
|
assert metric_exported, "No metrics exported"
|
||||||
|
|
||||||
|
|
||||||
|
@step(u'metric {metric_name} is {metric_value:d}')
|
||||||
|
def step_assert_metric_value(context, metric_name, metric_value):
|
||||||
|
if metric_name not in context.metrics:
|
||||||
|
assert False, f"no metric {metric_name} in {context.metrics.keys()}"
|
||||||
|
assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
|
||||||
|
|
||||||
|
|
||||||
@step(u'available models')
|
@step(u'available models')
|
||||||
def step_available_models(context):
|
def step_available_models(context):
|
||||||
# openai client always expects an api_key
|
# openai client always expects an api_key
|
||||||
@ -879,7 +889,6 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
|
|||||||
f' {n_predicted} <> {expected_predicted_n}')
|
f' {n_predicted} <> {expected_predicted_n}')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def gather_tasks_results(context):
|
async def gather_tasks_results(context):
|
||||||
n_tasks = len(context.concurrent_tasks)
|
n_tasks = len(context.concurrent_tasks)
|
||||||
if context.debug:
|
if context.debug:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user