From 2f0ee84b9b02d2a98742308026f060ebdc2423f1 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 2 Jan 2025 18:06:12 +0100 Subject: [PATCH] server: bench: minor fixes (#10765) * server/bench: - support OpenAI streaming standard output with [DONE]\n\n - export k6 raw results in csv - fix too many idle tcp connections in tcp_wait - add metric for time to emit first token * server/bench: - fix failure when prometheus is not started - wait for server to be ready before starting bench --- examples/server/bench/README.md | 6 +++--- examples/server/bench/bench.py | 30 +++++++++++++++++++++--------- examples/server/bench/script.js | 18 +++++++++++++++--- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 353368e13..9549795ec 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/). SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension. 
-Example: +Example (assuming golang >= 1.21 is installed): ```shell go install go.k6.io/xk6/cmd/xk6@latest -xk6 build master \ +$GOPATH/bin/xk6 build master \ --with github.com/phymbert/xk6-sse ``` @@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1 Example: ```shell -server --host localhost --port 8080 \ +llama-server --host localhost --port 8080 \ --model ggml-model-q4_0.gguf \ --cont-batching \ --metrics \ diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index a9ed747f5..5cc6f92ab 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -189,12 +189,12 @@ xychart-beta "pp": { "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2), "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2), - "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2), + "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0, }, "tg": { "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2), "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2), - "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2), + "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0, }, } with open("results.github.env", 'a') as github_env: @@ -214,11 +214,14 @@ def start_benchmark(args): k6_args = [ 'run', args.scenario, '--no-color', + '--no-connection-reuse', + '--no-vu-connection-reuse', ] k6_args.extend(['--duration', args.duration]) k6_args.extend(['--iterations', args.n_prompts]) k6_args.extend(['--vus', args.parallel]) k6_args.extend(['--summary-export', 'k6-results.json']) + k6_args.extend(['--out', 'csv=k6-results.csv']) args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} " args = 
args + ' '.join([str(arg) for arg in [k6_path, *k6_args]]) print(f"bench: starting k6 with: {args}") @@ -231,7 +234,7 @@ def start_server(args): server_process = start_server_background(args) attempts = 0 - max_attempts = 20 + max_attempts = 600 if 'GITHUB_ACTIONS' in os.environ: max_attempts *= 2 @@ -242,7 +245,15 @@ def start_server(args): print(f"bench: waiting for server to start ...") time.sleep(0.5) - print("bench: server started.") + attempts = 0 + while not is_server_ready(args.host, args.port): + attempts += 1 + if attempts > max_attempts: + assert False, "server not ready" + print(f"bench: waiting for server to be ready ...") + time.sleep(0.5) + + print("bench: server started and ready.") return server_process @@ -255,11 +266,6 @@ def start_server_background(args): '--host', args.host, '--port', args.port, ] - model_file = args.model_path_prefix + os.path.sep + args.hf_file - model_dir = os.path.dirname(model_file) - if not os.path.exists(model_dir): - os.makedirs(model_dir) - server_args.extend(['--model', model_file]) server_args.extend(['--hf-repo', args.hf_repo]) server_args.extend(['--hf-file', args.hf_file]) server_args.extend(['--n-gpu-layers', args.n_gpu_layers]) @@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port): return _is_server_listening +def is_server_ready(server_fqdn, server_port): + url = f"http://{server_fqdn}:{server_port}/health" + response = requests.get(url) + return response.status_code == 200 + + def escape_metric_name(metric_name): return re.sub('[^A-Z0-9]', '_', metric_name.upper()) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index bdf4f5abc..2772bee5e 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') const llamacpp_prompt_processing_second = new 
Trend('llamacpp_prompt_processing_second') +const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -89,6 +90,9 @@ export default function () { ], "model": model, "stream": true, + "stream_options": { + "include_usage": true, // False to be supported in llama.cpp server + }, "seed": 42, "max_tokens": max_tokens, "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS @@ -105,12 +109,20 @@ export default function () { client.on('event', function (event) { if (promptEvalEndTime == null) { promptEvalEndTime = new Date() + llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3) + } + + if (event.data === '[DONE]' || event.data === '') { + return } let chunk = JSON.parse(event.data) - let choice = chunk.choices[0] - if (choice.finish_reason) { - finish_reason = choice.finish_reason + + if (chunk.choices && chunk.choices.length > 0) { + let choice = chunk.choices[0] + if (choice.finish_reason) { + finish_reason = choice.finish_reason + } } if (chunk.usage) {