server : clarify /slots endpoint, add is_processing (#10162)

* server : clarify /slots endpoint, add is_processing
* fix tests
2025-01-26 12:21:40 +01:00 · 2024-11-04 16:33:29 +01:00 · 2024-11-04 16:33:29 +01:00 · 9e0ecfb697
commit 9e0ecfb697
parent 6a066b9978
3 changed files with 18 additions and 19 deletions
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte

 ### GET `/slots`: Returns the current slots processing state

-This endpoint can be disabled with `--no-slots`
+> [!WARNING]
+> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
+
+This endpoint is disabled by default and can be enabled with `--slots`

 If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.

@ -709,6 +712,7 @@ Example:
        "grammar": "",
        "id": 0,
        "ignore_eos": false,
+        "is_processing": false,
        "logit_bias": [],
        "min_p": 0.05000000074505806,
        "mirostat": 0,
@ -741,7 +745,6 @@ Example:
            "temperature"
        ],
        "seed": 42,
-        "state": 1,
        "stop": [
            "\n"
        ],
@ -755,10 +758,6 @@ Example:
 ]
 ```

-Possible values for `slot[i].state` are:
- `0`: SLOT_STATE_IDLE
- `1`: SLOT_STATE_PROCESSING
-
 ### GET `/metrics`: Prometheus compatible metrics exporter

 This endpoint is only accessible if `--metrics` is set.
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1566,11 +1566,11 @@ struct server_context {

                    for (server_slot & slot : slots) {
                        json slot_data = get_formated_generation(slot);
-                        slot_data["id"]         = slot.id;
-                        slot_data["id_task"]    = slot.id_task;
-                        slot_data["state"]      = slot.state;
-                        slot_data["prompt"]     = common_detokenize(ctx, slot.prompt_tokens);
-                        slot_data["next_token"] = {
+                        slot_data["id"]            = slot.id;
+                        slot_data["id_task"]       = slot.id_task;
+                        slot_data["is_processing"] = slot.is_processing();
+                        slot_data["prompt"]        = common_detokenize(ctx, slot.prompt_tokens);
+                        slot_data["next_token"]    = {
                            {"has_next_token", slot.has_next_token},
                            {"has_new_line",   slot.has_new_line},
                            {"n_remain",       slot.n_remaining},
@ -1581,10 +1581,10 @@ struct server_context {
                            {"stopping_word",  slot.stopping_word},
                        };

-                        if (slot_data["state"] == SLOT_STATE_IDLE) {
-                            n_idle_slots++;
-                        } else {
+                        if (slot.is_processing()) {
                            n_processing_slots++;
+                        } else {
+                            n_idle_slots++;
                        }

                        slots_data.push_back(slot_data);
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
    match expected_slot_status_string:
        case 'idle':
-            expected_slot_status = 0
+            expected_slot_status = False
        case 'busy':
-            expected_slot_status = 1
+            expected_slot_status = True
        case _:
            assert False, "unknown status"

-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
+    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
                      for slot_id in range(context.n_slots)]
    await request_slots_status(context, expected_slots)

@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
                if status_code == 503 and status_code == expected_http_status_code:
                    return
                if status_code == 200 and status_code == expected_http_status_code:
-                    n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
-                    n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+                    n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
+                    n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
                    if ((slots_idle is None or slots_idle == n_slots_idle)
                        and (slots_processing is None or slots_processing == n_slots_processing)):
                        return