server: bench: minor fixes (#10765)

* server/bench:
- support OpenAI streaming standard output terminated by [DONE]\n\n
- export k6 raw results in CSV
- fix too many idle TCP connections in tcp_wait
- add a metric for time to emit the first token

* server/bench:
- fix the case where Prometheus is not started
- wait for the server to be ready before starting the bench
Pierrick Hymbert 2025-01-02 18:06:12 +01:00 committed by GitHub
parent 0da5d86026
commit 2f0ee84b9b
3 changed files with 39 additions and 15 deletions
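
For context on the first bullet ("support OpenAI streaming standard output terminated by [DONE]"): in the OpenAI streaming format the final SSE event carries the literal payload `[DONE]`, and empty keep-alive events may also appear. Below is a minimal Python sketch of a client that tolerates both; the endpoint URL, model name, and payload are illustrative assumptions, not values taken from this patch.

```python
# Minimal sketch (not part of this patch): consuming an OpenAI-compatible
# chat completion stream that ends with a "data: [DONE]" sentinel.
import json
import requests

def stream_chat(url="http://localhost:8080/v1/chat/completions"):  # assumed endpoint
    payload = {
        "model": "default",  # placeholder model name
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "max_tokens": 32,
    }
    pieces = []
    with requests.post(url, json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            # SSE frames arrive as "data: <json>"; blank lines separate events.
            if not line or not line.startswith("data: "):
                continue
            data = line[len("data: "):]
            if data == "[DONE]":
                break  # end-of-stream sentinel of the OpenAI streaming format
            chunk = json.loads(data)
            if chunk.get("choices"):
                delta = chunk["choices"][0].get("delta", {})
                pieces.append(delta.get("content") or "")
    return "".join(pieces)
```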

examples/server/bench/README.md

@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
 SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
-Example:
+Example (assuming golang >= 1.21 is installed):
 ```shell
 go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
 --with github.com/phymbert/xk6-sse
 ```
@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
 Example:
 ```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
   --model ggml-model-q4_0.gguf \
   --cont-batching \
   --metrics \

examples/server/bench/bench.py

@@ -189,12 +189,12 @@ xychart-beta
         "pp": {
             "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
         },
         "tg": {
             "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
         },
     }
     with open("results.github.env", 'a') as github_env:
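
The `if 'prompt_tokens_seconds' in prometheus_metrics else 0` guards above only matter because the metrics scrape can legitimately come back empty. The patch does not show how `prometheus_metrics` is populated, so the following is only a sketch, assuming a plain Prometheus text-format endpoint and unlabeled samples, to illustrate why a missing series has to fall back to 0.

```python
# Sketch only: a guarded scrape that returns {} when the metrics source is down,
# which is what forces the "... else 0" fallback above. URL and metric names are
# assumptions for illustration, not the bench's actual configuration.
import requests

def scrape_prometheus(url="http://localhost:8080/metrics", timeout=5):
    """Return {metric_name: [values]}, or {} if the endpoint is unreachable."""
    metrics = {}
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException:
        return metrics  # metrics source not started: callers must cope with {}
    for line in resp.text.splitlines():
        if not line or line.startswith("#"):
            continue  # skip blanks and HELP/TYPE comment lines
        name, _, value = line.partition(" ")  # unlabeled samples only in this sketch
        try:
            metrics.setdefault(name, []).append(float(value))
        except ValueError:
            pass  # ignore anything that is not "name value"
    return metrics

prometheus_metrics = scrape_prometheus()
pp_0 = (round(sum(prometheus_metrics['prompt_tokens_seconds'])
              / len(prometheus_metrics['prompt_tokens_seconds']), 2)
        if 'prompt_tokens_seconds' in prometheus_metrics else 0)
```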
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
@@ -231,7 +234,7 @@ def start_server(args):
     server_process = start_server_background(args)
     attempts = 0
-    max_attempts = 20
+    max_attempts = 600
     if 'GITHUB_ACTIONS' in os.environ:
         max_attempts *= 2
@@ -242,7 +245,15 @@
         print(f"bench: waiting for server to start ...")
         time.sleep(0.5)
 
-    print("bench: server started.")
+    attempts = 0
+    while not is_server_ready(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not ready"
+        print(f"bench: waiting for server to be ready ...")
+        time.sleep(0.5)
+
+    print("bench: server started and ready.")
     return server_process
@@ -255,11 +266,6 @@ def start_server_background(args):
         '--host', args.host,
         '--port', args.port,
     ]
-    model_file = args.model_path_prefix + os.path.sep + args.hf_file
-    model_dir = os.path.dirname(model_file)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    server_args.extend(['--model', model_file])
     server_args.extend(['--hf-repo', args.hf_repo])
     server_args.extend(['--hf-file', args.hf_file])
     server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
     return _is_server_listening
 
 
+def is_server_ready(server_fqdn, server_port):
+    url = f"http://{server_fqdn}:{server_port}/health"
+    response = requests.get(url)
+    return response.status_code == 200
+
+
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())
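
`is_server_ready` is only called once `is_server_listening` has succeeded, so a bare `requests.get` is usually fine, but it will still raise if `/health` briefly refuses the connection while the model loads. A slightly more defensive variant (a sketch, not the patch's code) treats transport errors as "not ready yet":

```python
# Sketch only: readiness probe that swallows transient transport errors
# instead of letting them propagate out of the wait loop.
import requests

def is_server_ready_safe(server_fqdn, server_port, timeout=2):
    url = f"http://{server_fqdn}:{server_port}/health"
    try:
        return requests.get(url, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False  # connection refused / timed out: keep polling
```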

examples/server/bench/script.js

@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+            "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
         client.on('event', function (event) {
             if (promptEvalEndTime == null) {
                 promptEvalEndTime = new Date()
+                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+            }
+
+            if (event.data === '[DONE]' || event.data === '') {
+                return
             }
 
             let chunk = JSON.parse(event.data)
-            let choice = chunk.choices[0]
-            if (choice.finish_reason) {
-                finish_reason = choice.finish_reason
+
+            if (chunk.choices && chunk.choices.length > 0) {
+                let choice = chunk.choices[0]
+                if (choice.finish_reason) {
+                    finish_reason = choice.finish_reason
+                }
             }
 
             if (chunk.usage) {