server : clarify /slots endpoint, add is_processing (#10162)

* server : clarify /slots endpoint, add is_processing
* fix tests
2024-12-27 06:39:25 +01:00 · 2024-11-04 16:33:29 +01:00 · 2024-11-04 16:33:29 +01:00 · 9e0ecfb697
commit 9e0ecfb697
parent 6a066b9978
3 changed files with 18 additions and 19 deletions
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
 ### GET `/slots`: Returns the current slots processing state
-This endpoint can be disabled with `--no-slots`
+> [!WARNING]
+> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
+This endpoint is disabled by default and can be enabled with `--slots`
 If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
@@ -709,6 +712,7 @@ Example:
        "grammar": "",
        "id": 0,
        "ignore_eos": false,
+       "is_processing": false,
        "logit_bias": [],
        "min_p": 0.05000000074505806,
        "mirostat": 0,
@@ -741,7 +745,6 @@ Example:
            "temperature"
        ],
        "seed": 42,
-       "state": 1,
        "stop": [
            "\n"
        ],
@@ -755,10 +758,6 @@ Example:
 ]
 ```
-Possible values for `slot[i].state` are:
-- `0`: SLOT_STATE_IDLE
-- `1`: SLOT_STATE_PROCESSING
 ### GET `/metrics`: Prometheus compatible metrics exporter
 This endpoint is only accessible if `--metrics` is set.
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1566,11 +1566,11 @@ struct server_context {
                    for (server_slot & slot : slots) {
                        json slot_data = get_formated_generation(slot);
-                        slot_data["id"]         = slot.id;
+                        slot_data["id"]            = slot.id;
-                        slot_data["id_task"]    = slot.id_task;
+                        slot_data["id_task"]       = slot.id_task;
-                        slot_data["state"]      = slot.state;
+                        slot_data["is_processing"] = slot.is_processing();
-                        slot_data["prompt"]     = common_detokenize(ctx, slot.prompt_tokens);
+                        slot_data["prompt"]        = common_detokenize(ctx, slot.prompt_tokens);
-                        slot_data["next_token"] = {
+                        slot_data["next_token"]    = {
                            {"has_next_token", slot.has_next_token},
                            {"has_new_line",   slot.has_new_line},
                            {"n_remain",       slot.n_remaining},
@@ -1581,10 +1581,10 @@ struct server_context {
                            {"stopping_word",  slot.stopping_word},
                        };
-                        if (slot_data["state"] == SLOT_STATE_IDLE) {
-                            n_idle_slots++;
-                        } else {
+                        if (slot.is_processing()) {
                            n_processing_slots++;
+                        } else {
+                            n_idle_slots++;
                        }
                        slots_data.push_back(slot_data);
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
    match expected_slot_status_string:
        case 'idle':
-            expected_slot_status = 0
+            expected_slot_status = False
        case 'busy':
-            expected_slot_status = 1
+            expected_slot_status = True
        case _:
            assert False, "unknown status"
-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
+    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
                      for slot_id in range(context.n_slots)]
    await request_slots_status(context, expected_slots)
@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
                if status_code == 503 and status_code == expected_http_status_code:
                    return
                if status_code == 200 and status_code == expected_http_status_code:
-                    n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
+                    n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
-                    n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+                    n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
                    if ((slots_idle is None or slots_idle == n_slots_idle)
                        and (slots_processing is None or slots_processing == n_slots_processing)):
                        return