diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 384b87f7e..2cbfc0018 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -915,14 +915,14 @@ static json format_timings(llama_server_context &llama)
 
     return json{
         {"prompt_n", timings.n_eval},
-        {"prompt_ms", timings.prompt_eval_time_ms},
-        {"prompt_per_token_ms", timings.prompt_eval_time_ms / timings.n_p_eval},
-        {"prompt_per_second", 1e3 / timings.prompt_eval_time_ms * timings.n_p_eval},
+        {"prompt_ms", timings.t_p_eval_ms},
+        {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval},
+        {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval},
 
         {"predicted_n", timings.n_eval},
-        {"predicted_ms", timings.eval_time_ms},
-        {"predicted_per_token_ms", timings.eval_time_ms / timings.n_eval},
-        {"predicted_per_second", 1e3 / timings.eval_time_ms * timings.n_eval},
+        {"predicted_ms", timings.t_eval_ms},
+        {"predicted_per_token_ms", timings.t_eval_ms / timings.n_eval},
+        {"predicted_per_second", 1e3 / timings.t_eval_ms * timings.n_eval},
     };
 }
 
diff --git a/llama.cpp b/llama.cpp
index 2017a709a..02afdeb14 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3480,34 +3480,34 @@ llama_token llama_token_nl() {
     return 13;
 }
 
-llama_timings llama_get_timings(struct llama_context * ctx) {
-    llama_timings timings;
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
-    timings.t_end_ms = ggml_time_ms();
-    timings.t_start_ms = 1e-3 * ctx->t_start_us;
-    timings.load_time_ms = 1e-3 * ctx->t_load_us;
-    timings.sample_time_ms = 1e-3 * ctx->t_sample_us;
-    timings.prompt_eval_time_ms = 1e-3 * ctx->t_p_eval_us;
-    timings.eval_time_ms = 1e-3 * ctx->t_eval_us;
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval =*/ std::max(1, ctx->n_eval),
+    };
 
-    timings.n_sample = std::max(1, ctx->n_sample);
-    timings.n_p_eval = std::max(1, ctx->n_p_eval);
-    timings.n_eval = std::max(1, ctx->n_eval);
-
-    return timings;
+    return result;
 }
 
 void llama_print_timings(struct llama_context * ctx) {
-    llama_timings timings = llama_get_timings(ctx);
+    const llama_timings timings = llama_get_timings(ctx);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, timings.load_time_ms);
+    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.sample_time_ms, timings.n_sample, timings.sample_time_ms / timings.n_sample, 1e3 / timings.sample_time_ms * timings.n_sample);
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.prompt_eval_time_ms, timings.n_p_eval, timings.prompt_eval_time_ms / timings.n_p_eval, 1e3 / timings.prompt_eval_time_ms * timings.n_p_eval);
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.eval_time_ms, timings.n_eval, timings.eval_time_ms / timings.n_eval, 1e3 / timings.eval_time_ms * timings.n_eval);
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
diff --git a/llama.h b/llama.h
index 7dea68248..c1e7dab9f 100644
--- a/llama.h
+++ b/llama.h
@@ -138,10 +138,10 @@ extern "C" {
     struct llama_timings {
         double t_start_ms;
         double t_end_ms;
-        double load_time_ms;
-        double sample_time_ms;
-        double prompt_eval_time_ms;
-        double eval_time_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
 
         int32_t n_sample;
         int32_t n_p_eval;
@@ -345,7 +345,7 @@ extern "C" {
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
-    LLAMA_API llama_timings llama_get_timings(struct llama_context * ctx);
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 