server : clarify /slots endpoint, add is_processing (#10162)

author Xuan Son Nguyen <redacted>

Mon, 4 Nov 2024 15:33:29 +0000 (16:33 +0100)

committer GitHub <redacted>

Mon, 4 Nov 2024 15:33:29 +0000 (16:33 +0100)
author Xuan Son Nguyen <redacted>
Mon, 4 Nov 2024 15:33:29 +0000 (16:33 +0100)
committer GitHub <redacted>
Mon, 4 Nov 2024 15:33:29 +0000 (16:33 +0100)
diff --git a/examples/server/README.md b/examples/server/README.md

index 1629e456b68360cc607fa34dce1ebb539ae3a801..15f95db1e06e1e9717c16603906c5656f6d627af 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
  
  ### GET `/slots`: Returns the current slots processing state
  
-This endpoint can be disabled with `--no-slots`
+> [!WARNING]
+> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
+
+This endpoint is disabled by default and can be enabled with `--slots`.
  
  If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.
  
@@ -709,6 +712,7 @@ Example:
          "grammar": "",
          "id": 0,
          "ignore_eos": false,
+        "is_processing": false,
          "logit_bias": [],
          "min_p": 0.05000000074505806,
          "mirostat": 0,
@@ -741,7 +745,6 @@ Example:
              "temperature"
          ],
          "seed": 42,
-        "state": 1,
          "stop": [
              "\n"
          ],
@@ -755,10 +758,6 @@ Example:
  ]
  ```
  
-Possible values for `slot[i].state` are:
-- `0`: SLOT_STATE_IDLE
-- `1`: SLOT_STATE_PROCESSING
-
  ### GET `/metrics`: Prometheus compatible metrics exporter
  
  This endpoint is only accessible if `--metrics` is set.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 8531a784ded3dae5a423d3e8eff9c0908afc8405..f0b89b22cd22da9a0f72d971ba68948e499457c2 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1566,11 +1566,11 @@ struct server_context {
  
                      for (server_slot & slot : slots) {
                          json slot_data = get_formated_generation(slot);
-                        slot_data["id"]         = slot.id;
-                        slot_data["id_task"]    = slot.id_task;
-                        slot_data["state"]      = slot.state;
-                        slot_data["prompt"]     = common_detokenize(ctx, slot.prompt_tokens);
-                        slot_data["next_token"] = {
+                        slot_data["id"]            = slot.id;
+                        slot_data["id_task"]       = slot.id_task;
+                        slot_data["is_processing"] = slot.is_processing();
+                        slot_data["prompt"]        = common_detokenize(ctx, slot.prompt_tokens);
+                        slot_data["next_token"]    = {
                              {"has_next_token", slot.has_next_token},
                              {"has_new_line",   slot.has_new_line},
                              {"n_remain",       slot.n_remaining},
@@ -1581,10 +1581,10 @@ struct server_context {
                              {"stopping_word",  slot.stopping_word},
                          };
  
-                        if (slot_data["state"] == SLOT_STATE_IDLE) {
-                            n_idle_slots++;
-                        } else {
+                        if (slot.is_processing()) {
                              n_processing_slots++;
+                        } else {
+                            n_idle_slots++;
                          }
  
                          slots_data.push_back(slot_data);
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py

index 2e418d8aa571b620e0c07b4a666da64926d630ac..687b163f487b6c1e028d7514d51074f3b90323b4 100644 (file)
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
  async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
      match expected_slot_status_string:
          case 'idle':
-            expected_slot_status = 0
+            expected_slot_status = False
          case 'busy':
-            expected_slot_status = 1
+            expected_slot_status = True
          case _:
              assert False, "unknown status"
  
-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
+    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
                        for slot_id in range(context.n_slots)]
      await request_slots_status(context, expected_slots)
  
@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
                  if status_code == 503 and status_code == expected_http_status_code:
                      return
                  if status_code == 200 and status_code == expected_http_status_code:
-                    n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
-                    n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+                    n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
+                    n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
                      if ((slots_idle is None or slots_idle == n_slots_idle)
                          and (slots_processing is None or slots_processing == n_slots_processing)):
                          return
author	Xuan Son Nguyen <redacted>
	Mon, 4 Nov 2024 15:33:29 +0000 (16:33 +0100)
committer	GitHub <redacted>
	Mon, 4 Nov 2024 15:33:29 +0000 (16:33 +0100)
examples/server/README.md		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history
examples/server/tests/features/steps/steps.py		patch \| blob \| history