server : enhanced health endpoint (#5548)

author Pierrick Hymbert <redacted>

Sun, 18 Feb 2024 16:31:28 +0000 (17:31 +0100)

committer GitHub <redacted>

Sun, 18 Feb 2024 16:31:28 +0000 (18:31 +0200)
author Pierrick Hymbert <redacted>
Sun, 18 Feb 2024 16:31:28 +0000 (17:31 +0100)
committer GitHub <redacted>
Sun, 18 Feb 2024 16:31:28 +0000 (18:31 +0200)
diff --git a/examples/server/README.md b/examples/server/README.md

index fe5cd8d5d138207a2f1e4671d625008edde145bf..5e3ae833bef57672040733c38af0e8e5a940e6e3 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -136,6 +136,7 @@ node index.js
    - `{"status": "loading model"}` if the model is still being loaded.
    - `{"status": "error"}` if the model failed to load.
    - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
  
  - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
  
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 7aa706e9530798d9530915a7136838e726704d32..8145af867292b3841b2eba1639eed7dd2c64d4f6 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2578,8 +2578,35 @@ int main(int argc, char **argv)
          server_state current_state = state.load();
          switch(current_state) {
              case SERVER_STATE_READY:
-                res.set_content(R"({"status": "ok"})", "application/json");
-                res.status = 200; // HTTP OK
+                if (llama.all_slots_are_idle) {
+                    res.set_content(R"({"status": "ok"})", "application/json");
+                    res.status = 200; // HTTP OK
+                } else {
+                    int available_slots = 0;
+                    int processing_slots = 0;
+                    for (llama_client_slot & slot : llama.slots) {
+                        if (slot.available()) {
+                            available_slots++;
+                        } else {
+                            processing_slots++;
+                        }
+                    }
+                    if (available_slots > 0) {
+                        json health = {
+                                {"status",           "ok"},
+                                {"slots_idle",       available_slots},
+                                {"slots_processing", processing_slots}};
+                        res.set_content(health.dump(), "application/json");
+                        res.status = 200; // HTTP OK
+                    } else {
+                        json health = {
+                                {"status",           "no slot available"},
+                                {"slots_idle",       available_slots},
+                                {"slots_processing", processing_slots}};
+                        res.set_content(health.dump(), "application/json");
+                        res.status = 503; // HTTP Service Unavailable
+                    }
+                }
                  break;
              case SERVER_STATE_LOADING_MODEL:
                  res.set_content(R"({"status": "loading model"})", "application/json");
author	Pierrick Hymbert <redacted>
	Sun, 18 Feb 2024 16:31:28 +0000 (17:31 +0100)
committer	GitHub <redacted>
	Sun, 18 Feb 2024 16:31:28 +0000 (18:31 +0200)
examples/server/README.md		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history