server: ci: windows build and tests (#5968)

* server: ci: windows build and tests

* server: ci: remove tmp push branch

* server: ci: EOF EOL

* Use builti

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* server: tests: server graceful shutdown, then kill, then hard kill

* server: tests: remove python2 unicode string

* server: tests: remove wrong comment on server starting; close_fds is always true

* server: tests: server kill, if pid exists

* server: tests: remove dependency to killall

* server: tests: ci windows: pid exists better handling

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Pierrick Hymbert 2024-03-10 18:17:47 +01:00 committed by GitHub
parent bcebd7dbf6
commit fa8a809a91
4 changed files with 191 additions and 99 deletions

.github/workflows/server.yml

@@ -47,6 +47,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v3
+        with:
+          fetch-depth: 0

       - name: Dependencies
         id: depends
@@ -58,7 +60,6 @@ jobs:
            cmake \
            python3-pip \
            wget \
-            psmisc \
            language-pack-en

       - name: Build
@@ -90,3 +91,46 @@ jobs:
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+
+  server-windows:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
+      - name: Tests
+        id: server_integration_tests
+        run: |
+          cd examples/server/tests
+          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        run: |
+          cd examples/server/tests
+          behave.exe --stop --no-skipped --no-capture --tags slow
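For local debugging, the scenario selection used by the new Windows "Tests" step can be reproduced with a plain behave invocation. A minimal sketch in Python: the flags are copied from the workflow step above, while everything else (installed test requirements, a built server binary) is assumed.

# Sketch only: runs the same behave selection as the server-windows "Tests" step.
# Assumes `pip install -r examples/server/tests/requirements.txt` has been done
# and the server binary has already been built.
import subprocess

subprocess.run(
    [
        "behave",                # resolved as behave.exe on Windows
        "--summary",
        "--stop",                # stop at the first failing scenario
        "--no-capture",          # stream server output directly to the console
        "--exclude", "issues|wrong_usages|passkey",
        "--tags", "llama.cpp",
    ],
    cwd="examples/server/tests",
    check=True,
)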

examples/server/tests/features/environment.py

@@ -1,9 +1,10 @@
+import errno
 import os
 import socket
 import subprocess
 import time
 from contextlib import closing
-from signal import SIGKILL
+import signal

 def before_scenario(context, scenario):
@@ -29,41 +30,68 @@ def after_scenario(context, scenario):
                     for line in f:
                         print(line)
         if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
+            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")

     if not pid_exists(context.server_process.pid):
         assert False, f"Server not running pid={context.server_process.pid} ..."

-    print(f"stopping server pid={context.server_process.pid} ...")
-    context.server_process.kill()
+    server_graceful_shutdown(context)

     # Wait few for socket to free up
     time.sleep(0.05)

     attempts = 0
-    while is_server_listening(context.server_fqdn, context.server_port):
-        print(f"stopping server pid={context.server_process.pid} ...")
-        os.kill(context.server_process.pid, SIGKILL)
+    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
+        server_kill(context)
         time.sleep(0.1)
         attempts += 1
         if attempts > 5:
-            print(f"Server dangling exits, killing all {context.server_path} ...")
-            process = subprocess.run(['killall', '-9', context.server_path],
-                                     stderr=subprocess.PIPE,
-                                     universal_newlines=True)
-            print(process)
+            server_kill_hard(context)
+
+def server_graceful_shutdown(context):
+    print(f"shutting down server pid={context.server_process.pid} ...\n")
+    if os.name == 'nt':
+        os.kill(context.server_process.pid, signal.CTRL_C_EVENT)
+    else:
+        os.kill(context.server_process.pid, signal.SIGINT)
+
+def server_kill(context):
+    print(f"killing server pid={context.server_process.pid} ...\n")
+    context.server_process.kill()
+
+def server_kill_hard(context):
+    pid = context.server_process.pid
+    path = context.server_path
+
+    print(f"Server dangling exits, hard killing force {pid}={path}...\n")
+    if os.name == 'nt':
+        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
+        print(process)
+    else:
+        os.kill(-pid, signal.SIGKILL)

 def is_server_listening(server_fqdn, server_port):
     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
         result = sock.connect_ex((server_fqdn, server_port))
-        return result == 0
+        _is_server_listening = result == 0
+        if _is_server_listening:
+            print(f"server is listening on {server_fqdn}:{server_port}...\n")
+        return _is_server_listening

 def pid_exists(pid):
     """Check whether pid exists in the current process table."""
-    import errno
     if pid < 0:
         return False
-    try:
-        os.kill(pid, 0)
-    except OSError as e:
+    if os.name == 'nt':
+        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
+        print(output)
+        return "No tasks are running" not in output
+    else:
+        try:
+            os.kill(pid, 0)
+        except OSError as e:
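Condensed, the teardown added here escalates in three stages: graceful shutdown, then kill, then a platform-specific hard kill with no killall dependency. A standalone sketch of the same flow, with the behave context replaced by a bare Popen handle; the poll() check and timings are simplifications of pid_exists()/is_server_listening().

# Illustration only: mirrors after_scenario / server_graceful_shutdown /
# server_kill / server_kill_hard from the diff above.
import os
import signal
import subprocess
import time

def shutdown_escalating(proc: subprocess.Popen):
    # 1) graceful: CTRL+C event on Windows, SIGINT elsewhere
    os.kill(proc.pid, signal.CTRL_C_EVENT if os.name == 'nt' else signal.SIGINT)
    time.sleep(0.05)                # give the socket a moment to free up
    attempts = 0
    while proc.poll() is None:      # stand-in for pid_exists() / is_server_listening()
        proc.kill()                 # 2) plain kill while the process is still around
        time.sleep(0.1)
        attempts += 1
        if attempts > 5:            # 3) hard kill: taskkill on Windows, SIGKILL otherwise
            if os.name == 'nt':
                print(subprocess.check_output(['taskkill', '/F', '/pid', str(proc.pid)]).decode())
            else:
                os.kill(proc.pid, signal.SIGKILL)  # the diff uses -pid to hit a process group
            break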

examples/server/tests/features/server.feature

@@ -47,7 +47,7 @@ Feature: llama.cpp server
      Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
      """
     And a completion request with no api error
-    Then 64 tokens are predicted matching fun|Annaks|popcorns
+    Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry
     And the completion is truncated
     And 109 prompt tokens are processed
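The "matching" clause treats its argument as a regular expression checked against the generated content (see assert_n_tokens_predicted in steps.py), so this change simply widens the alternation. A quick illustration with plain re; the sample strings are made up.

import re

pattern = r"fun|Annaks|popcorns|pictry"   # pattern from the updated scenario
for sample in ("we had fun making pictry", "nothing relevant here"):  # invented outputs
    print(sample, "->", bool(re.search(pattern, sample)))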

examples/server/tests/features/steps/steps.py

@@ -18,7 +18,7 @@ from huggingface_hub import hf_hub_download
 from prometheus_client import parser

-@step(u"a server listening on {server_fqdn}:{server_port}")
+@step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
@@ -57,24 +57,24 @@ def step_server_config(context, server_fqdn, server_port):
     context.prompts = []

-@step(u'a model file {hf_file} from HF repo {hf_repo}')
+@step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file, hf_repo):
     context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
     if context.debug:
         print(f"model file: {context.model_file}\n")

-@step(u'a model alias {model_alias}')
+@step('a model alias {model_alias}')
 def step_model_alias(context, model_alias):
     context.model_alias = model_alias

-@step(u'{seed:d} as server seed')
+@step('{seed:d} as server seed')
 def step_seed(context, seed):
     context.server_seed = seed

-@step(u'{ngl:d} GPU offloaded layers')
+@step('{ngl:d} GPU offloaded layers')
 def step_n_gpu_layer(context, ngl):
     if 'N_GPU_LAYERS' in os.environ:
         new_ngl = int(os.environ['N_GPU_LAYERS'])
@@ -84,37 +84,37 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl

-@step(u'{n_ctx:d} KV cache size')
+@step('{n_ctx:d} KV cache size')
 def step_n_ctx(context, n_ctx):
     context.n_ctx = n_ctx

-@step(u'{n_slots:d} slots')
+@step('{n_slots:d} slots')
 def step_n_slots(context, n_slots):
     context.n_slots = n_slots

-@step(u'{n_predict:d} server max tokens to predict')
+@step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict):
     context.n_server_predict = n_predict

-@step(u'continuous batching')
+@step('continuous batching')
 def step_server_continuous_batching(context):
     context.server_continuous_batching = True

-@step(u'embeddings extraction')
+@step('embeddings extraction')
 def step_server_embeddings(context):
     context.server_embeddings = True

-@step(u'prometheus compatible metrics exposed')
+@step('prometheus compatible metrics exposed')
 def step_server_metrics(context):
     context.server_metrics = True

-@step(u"the server is starting")
+@step("the server is starting")
 def step_start_server(context):
     start_server_background(context)
     attempts = 0
@@ -131,7 +131,7 @@ def step_start_server(context):
         time.sleep(0.1)

-@step(u"the server is {expecting_status}")
+@step("the server is {expecting_status}")
 @async_run_until_complete
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
     match expecting_status:
@@ -160,7 +160,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
             assert False, "unknown status"

-@step(u'all slots are {expected_slot_status_string}')
+@step('all slots are {expected_slot_status_string}')
 @async_run_until_complete
 async def step_all_slots_status(context, expected_slot_status_string):
     match expected_slot_status_string:
@@ -176,7 +176,7 @@ async def step_all_slots_status(context, expected_slot_status_string):
     await request_slots_status(context, expected_slots)

-@step(u'a completion request with {api_error} api error')
+@step('a completion request with {api_error} api error')
 @async_run_until_complete
 async def step_request_completion(context, api_error):
     expect_api_error = api_error == 'raised'
@@ -194,133 +194,133 @@ async def step_request_completion(context, api_error):
         assert completion == 401, f"completion must be an 401 status code: {completion}"

-@step(u'{predicted_n:d} tokens are predicted matching {re_content}')
+@step('{predicted_n:d} tokens are predicted matching {re_content}')
 def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
     context.completion = context.tasks_result.pop()
     assert_n_tokens_predicted(context.completion, predicted_n, re_content)

-@step(u'{predicted_n:d} tokens are predicted')
+@step('{predicted_n:d} tokens are predicted')
 def step_n_tokens_predicted(context, predicted_n):
     context.completion = context.tasks_result.pop()
     assert_n_tokens_predicted(context.completion, predicted_n)

-@step(u'the completion is truncated')
+@step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')

-@step(u'the completion is {truncated} truncated')
+@step('the completion is {truncated} truncated')
 def step_assert_completion_truncated(context, truncated):
     truncated = truncated != "not"
     assert context.completion['truncated'] == truncated, f'{context.completion}'

-@step(u'{n_prompt:d} prompt tokens are processed')
+@step('{n_prompt:d} prompt tokens are processed')
 def step_impl(context, n_prompt):
     assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"

-@step(u'a user prompt {user_prompt}')
+@step('a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
     context.prompts.append(user_prompt)
     context.n_prompts = len(context.prompts)

-@step(u'a system prompt {system_prompt}')
+@step('a system prompt {system_prompt}')
 def step_system_prompt(context, system_prompt):
     context.system_prompt = system_prompt

-@step(u'a model {model}')
+@step('a model {model}')
 def step_model(context, model):
     context.model = model

-@step(u'{max_tokens:d} max tokens to predict')
+@step('{max_tokens:d} max tokens to predict')
 def step_max_tokens(context, max_tokens):
     context.n_predict = max_tokens

-@step(u'streaming is {enable_streaming}')
+@step('streaming is {enable_streaming}')
 def step_streaming(context, enable_streaming):
     context.enable_streaming = enable_streaming == 'enabled'

-@step(u'a user api key {user_api_key}')
+@step('a user api key {user_api_key}')
 def step_user_api_key(context, user_api_key):
     context.user_api_key = user_api_key

-@step(u'no user api key')
+@step('no user api key')
 def step_no_user_api_key(context):
     context.user_api_key = None

-@step(u'a user api key ')
+@step('a user api key ')
 def step_no_user_api_key_space(context):
     context.user_api_key = None

-@step(u'a server api key {server_api_key}')
+@step('a server api key {server_api_key}')
 def step_server_api_key(context, server_api_key):
     context.server_api_key = server_api_key

-@step(u'{n_junk:d} as number of junk')
+@step('{n_junk:d} as number of junk')
 def step_n_junk(context, n_junk):
     context.n_junk = n_junk

-@step(u'{n_batch:d} as batch size')
+@step('{n_batch:d} as batch size')
 def step_n_batch(context, n_batch):
     context.n_batch = n_batch

-@step(u'{seed:d} as seed')
+@step('{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed

-@step(u'a prefix prompt')
+@step('a prefix prompt')
 def step_prompt_prefix(context):
-    context.prompt_prefix = context.text
+    context.prompt_prefix = context_text(context)

-@step(u'a junk suffix prompt')
+@step('a junk suffix prompt')
 def step_prompt_junk_suffix(context):
-    context.prompt_junk_suffix = context.text
+    context.prompt_junk_suffix = context_text(context)

-@step(u'a suffix prompt')
+@step('a suffix prompt')
 def step_prompt_suffix(context):
-    context.prompt_suffix = context.text
+    context.prompt_suffix = context_text(context)

-@step(u'{n_ga:d} group attention factor'
-      u' to extend context size through self-extend')
+@step('{n_ga:d} group attention factor'
+      ' to extend context size through self-extend')
 def step_impl(context, n_ga):
     context.n_ga = n_ga

-@step(u'{n_ga_w:d} group attention width to extend context size through self-extend')
+@step('{n_ga_w:d} group attention width to extend context size through self-extend')
 def step_impl(context, n_ga_w):
     context.n_ga_w = n_ga_w

-@step(u'a passkey prompt template')
+@step('a passkey prompt template')
 def step_prompt_passkey(context):
-    context.prompt_passkey = context.text
+    context.prompt_passkey = context_text(context)

-@step(u'{n_prompts:d} fixed prompts')
+@step('{n_prompts:d} fixed prompts')
 def step_fixed_prompts(context, n_prompts):
     context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
     context.n_prompts = n_prompts

-@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
+@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
 def step_prompt_passkey(context, passkey, i_pos):
     prompt = ""
     for i in range(context.n_junk):
@@ -334,7 +334,7 @@ def step_prompt_passkey(context, passkey, i_pos):
     context.n_prompts = len(context.prompts)

-@step(u'an OAI compatible chat completions request with {api_error} api error')
+@step('an OAI compatible chat completions request with {api_error} api error')
 @async_run_until_complete
 async def step_oai_chat_completions(context, api_error):
     if context.debug:
@@ -369,19 +369,19 @@ async def step_oai_chat_completions(context, api_error):
             print(f"Completion response: {completion}")

-@step(u'a prompt')
+@step('a prompt')
 def step_a_prompt(context):
-    context.prompts.append(context.text)
+    context.prompts.append(context_text(context))
     context.n_prompts = len(context.prompts)

-@step(u'a prompt {prompt}')
+@step('a prompt {prompt}')
 def step_a_prompt_prompt(context, prompt):
     context.prompts.append(prompt)
     context.n_prompts = len(context.prompts)

-@step(u'concurrent completion requests')
+@step('concurrent completion requests')
 @async_run_until_complete()
 async def step_concurrent_completion_requests(context):
     await concurrent_requests(context,
@@ -397,7 +397,7 @@ async def step_concurrent_completion_requests(context):
                                  'user_api_key') else None)

-@step(u'concurrent OAI completions requests')
+@step('concurrent OAI completions requests')
 @async_run_until_complete
 async def step_oai_chat_completions(context):
     await concurrent_requests(context, oai_chat_completions,
@@ -417,7 +417,7 @@ async def step_oai_chat_completions(context):
                               if hasattr(context, 'user_api_key') else None)

-@step(u'concurrent OAI completions requests no v1')
+@step('concurrent OAI completions requests no v1')
 @async_run_until_complete
 async def step_oai_chat_completions(context):
     await concurrent_requests(context, oai_chat_completions,
@@ -440,13 +440,13 @@ async def step_oai_chat_completions(context):
                               if hasattr(context, 'user_api_key') else None)

-@step(u'all prompts are predicted')
+@step('all prompts are predicted')
 @async_run_until_complete
 async def step_all_prompts_are_predicted(context):
     await all_prompts_are_predicted(context)

-@step(u'all prompts are predicted with {n_expected_predicted:d} tokens')
+@step('all prompts are predicted with {n_expected_predicted:d} tokens')
 @async_run_until_complete
 async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
     await all_prompts_are_predicted(context, n_expected_predicted)
@@ -460,14 +460,14 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
     assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"

-@step(u'embeddings are computed for')
+@step('embeddings are computed for')
 @async_run_until_complete
 async def step_compute_embedding(context):
     context.n_prompts = 1
-    context.embeddings = await request_embedding(context.text, base_url=context.base_url)
+    context.embeddings = await request_embedding(context_text(context), base_url=context.base_url)

-@step(u'all embeddings are the same')
+@step('all embeddings are the same')
 @async_run_until_complete
 async def step_all_embeddings_are_the_same(context):
     n_embedding_requests = await gather_tasks_results(context)
@@ -491,7 +491,8 @@ async def step_all_embeddings_are_the_same(context):
             print(f"{msg}\n")
         assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg

-@step(u'embeddings are generated')
+
+@step('embeddings are generated')
 def step_assert_embeddings(context):
     assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
                                                           f"context.n_prompts={context.n_prompts}\n"
@@ -500,17 +501,17 @@ def step_assert_embeddings(context):
         assert_embeddings(embedding)

-@step(u'an OAI compatible embeddings computation request for')
+@step('an OAI compatible embeddings computation request for')
 @async_run_until_complete
 async def step_oai_compute_embeddings(context):
     context.n_prompts = 1
-    context.embeddings = await request_oai_embeddings(context.text,
+    context.embeddings = await request_oai_embeddings(context_text(context),
                                                       base_url=context.base_url,
                                                       user_api_key=context.user_api_key,
                                                       model=context.model)

-@step(u'an OAI compatible embeddings computation request for multiple inputs')
+@step('an OAI compatible embeddings computation request for multiple inputs')
 @async_run_until_complete
 async def step_oai_compute_embeddings_multiple_inputs(context):
     context.embeddings = await request_oai_embeddings(context.prompts,
@@ -520,7 +521,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context):
     context.prompts.clear()

-@step(u'concurrent embedding requests')
+@step('concurrent embedding requests')
 @async_run_until_complete()
 async def step_concurrent_embedding_requests(context):
     await concurrent_requests(context,
@@ -529,7 +530,7 @@ async def step_concurrent_embedding_requests(context):
                               base_url=context.base_url)

-@step(u'concurrent OAI embedding requests')
+@step('concurrent OAI embedding requests')
 @async_run_until_complete()
 async def step_concurrent_oai_embedding_requests(context):
     await concurrent_requests(context,
@@ -540,7 +541,7 @@ async def step_concurrent_oai_embedding_requests(context):
                               model=context.model)

-@step(u'all embeddings are generated')
+@step('all embeddings are generated')
 @async_run_until_complete()
 async def all_embeddings_are_generated(context):
     n_embedding_requests = await gather_tasks_results(context)
@@ -549,10 +550,10 @@ async def all_embeddings_are_generated(context):
         assert_embeddings(context.tasks_result.pop().pop())

-@step(u'tokenizing')
+@step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):
-    context.tokenized_text = context.text
+    context.tokenized_text = context_text(context)
     async with aiohttp.ClientSession() as session:
         async with session.post(f'{context.base_url}/tokenize',
                                 json={
@@ -563,7 +564,7 @@ async def step_tokenize(context):
         context.tokens = tokenize_json['tokens']

-@step(u'tokens can be detokenize')
+@step('tokens can be detokenize')
 @async_run_until_complete
 async def step_detokenize(context):
     assert len(context.tokens) > 0
@@ -578,7 +579,7 @@ async def step_detokenize(context):
     assert context.tokenized_text == detokenize_json['content'].strip()

-@step(u'an OPTIONS request is sent from {origin}')
+@step('an OPTIONS request is sent from {origin}')
 @async_run_until_complete
 async def step_options_request(context, origin):
     async with aiohttp.ClientSession() as session:
@@ -589,12 +590,12 @@ async def step_options_request(context, origin):
         context.options_response = response

-@step(u'CORS header {cors_header} is set to {cors_header_value}')
+@step('CORS header {cors_header} is set to {cors_header_value}')
 def step_check_options_header_value(context, cors_header, cors_header_value):
     assert context.options_response.headers[cors_header] == cors_header_value

-@step(u'prometheus metrics are exposed')
+@step('prometheus metrics are exposed')
 @async_run_until_complete
 async def step_prometheus_metrics_exported(context):
     async with aiohttp.ClientSession() as session:
@@ -616,14 +617,14 @@ async def step_prometheus_metrics_exported(context):
             assert metric_exported, "No metrics exported"

-@step(u'metric {metric_name} is {metric_value:d}')
+@step('metric {metric_name} is {metric_value:d}')
 def step_assert_metric_value(context, metric_name, metric_value):
     if metric_name not in context.metrics:
         assert False, f"no metric {metric_name} in {context.metrics.keys()}"
     assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"

-@step(u'available models')
+@step('available models')
 def step_available_models(context):
     # openai client always expects an api_key
     openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
@@ -631,14 +632,14 @@ def step_available_models(context):
     context.models = openai.Model.list().data

-@step(u'{n_model:d} models are supported')
+@step('{n_model:d} models are supported')
 def step_supported_models(context, n_model):
     if context.debug:
         print("server models available:", context.models)
     assert len(context.models) == n_model

-@step(u'model {i_model:d} is {param} {preposition} {param_value}')
+@step('model {i_model:d} is {param} {preposition} {param_value}')
 def step_supported_models(context, i_model, param, preposition, param_value):
     assert i_model < len(context.models)
     model = context.models[i_model]
@@ -1007,12 +1008,22 @@ async def completions_seed(context):
         else context.server_seed if hasattr(context, 'server_seed') else None

+def context_text(context):
+    return context.text.replace('\r', '')
+
 def start_server_background(context):
-    context.server_path = '../../../build/bin/server'
+    if os.name == 'nt':
+        context.server_path = '../../../build/bin/Release/server.exe'
+    else:
+        context.server_path = '../../../build/bin/server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
+    server_listen_addr = context.server_fqdn
+    if os.name == 'nt':
+        server_listen_addr = '0.0.0.0'
     server_args = [
-        '--host', context.server_fqdn,
+        '--host', server_listen_addr,
         '--port', context.server_port,
         '--model', context.model_file
     ]
@@ -1045,7 +1056,16 @@ def start_server_background(context):
     if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
         server_args.extend(['--log-format', "text"])
     print(f"starting server with: {context.server_path} {server_args}\n")
+    flags = 0
+    if 'nt' == os.name:
+        flags |= subprocess.DETACHED_PROCESS
+        flags |= subprocess.CREATE_NEW_PROCESS_GROUP
+        flags |= subprocess.CREATE_NO_WINDOW
+    pkwargs = {
+        'creationflags': flags,
+    }
     context.server_process = subprocess.Popen(
         [str(arg) for arg in [context.server_path, *server_args]],
-        close_fds=True)
+        **pkwargs)
-    print(f"server pid={context.server_process.pid}")
+    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")