## API Endpoints
- **GET** `/health`: Returns the current state of the server:
- - `{"status": "loading model"}` if the model is still being loaded.
- - `{"status": "error"}` if the model failed to load.
- - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
- - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
+ - 503 -> `{"status": "loading model"}` if the model is still being loaded.
+ - 500 -> `{"status": "error"}` if the model failed to load.
+ - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
+ - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
+ - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
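To make the new `/health` response shapes concrete, here is a minimal client sketch; it is not part of the change itself. It assumes the server is running at the default `http://localhost:8080` and reuses cpp-httplib (`httplib.h`) and nlohmann's `json.hpp`, both already dependencies of the server. Note that the handler only checks that `fail_on_no_slot` is present; its value is ignored.

```cpp
// health_check.cpp - minimal readiness probe for the /health endpoint.
// Assumes the server listens on the default localhost:8080.
#include <cstdio>

#include "httplib.h"  // cpp-httplib, the HTTP library the server itself uses
#include "json.hpp"   // nlohmann::json, also already a server dependency

using json = nlohmann::json;

int main() {
    httplib::Client cli("http://localhost:8080");

    // The value of fail_on_no_slot is irrelevant; only its presence matters.
    auto res = cli.Get("/health?fail_on_no_slot=1");
    if (!res) {
        fprintf(stderr, "server unreachable\n");
        return 1;
    }

    json health = json::parse(res->body);
    printf("HTTP %d: status=%s, slots_idle=%d, slots_processing=%d\n",
           res->status,
           health.value("status", "unknown").c_str(),
           health.value("slots_idle", 0),        // absent while loading: defaults to 0
           health.value("slots_processing", 0));

    // 200 means the model is loaded and at least one slot is idle.
    return res->status == 200 ? 0 : 1;
}
```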
res.set_header("Access-Control-Allow-Headers", "*");
});
- svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
+ svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
server_state current_state = state.load();
switch(current_state) {
- case SERVER_STATE_READY:
- if (llama.all_slots_are_idle) {
- res.set_content(R"({"status": "ok"})", "application/json");
+ case SERVER_STATE_READY: {
+ int available_slots = 0;
+ int processing_slots = 0;
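+ // count idle vs. busy slots so /health can report remaining capacity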
+ for (llama_client_slot & slot : llama.slots) {
+ if (slot.available()) {
+ available_slots++;
+ } else {
+ processing_slots++;
+ }
+ }
+ if (available_slots > 0) {
+ json health = {
+ {"status", "ok"},
+ {"slots_idle", available_slots},
+ {"slots_processing", processing_slots}};
+ res.set_content(health.dump(), "application/json");
res.status = 200; // HTTP OK
} else {
- int available_slots = 0;
- int processing_slots = 0;
- for (llama_client_slot & slot : llama.slots) {
- if (slot.available()) {
- available_slots++;
- } else {
- processing_slots++;
- }
- }
- if (available_slots > 0) {
- json health = {
- {"status", "ok"},
- {"slots_idle", available_slots},
- {"slots_processing", processing_slots}};
- res.set_content(health.dump(), "application/json");
- res.status = 200; // HTTP OK
- } else {
- json health = {
- {"status", "no slot available"},
- {"slots_idle", available_slots},
- {"slots_processing", processing_slots}};
- res.set_content(health.dump(), "application/json");
+ json health = {
+ {"status", "no slot available"},
+ {"slots_idle", available_slots},
+ {"slots_processing", processing_slots}};
+ res.set_content(health.dump(), "application/json");
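+ // a saturated server only becomes an error if the client opted in via fail_on_no_slot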
+ if (req.has_param("fail_on_no_slot")) {
res.status = 503; // HTTP Service Unavailable
+ } else {
+ res.status = 200; // HTTP OK
}
}
break;
+ }
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");
res.status = 503; // HTTP Service Unavailable
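Because both "loading model" and (with `fail_on_no_slot`) "no slot available" surface as a 503, a startup or orchestration script can simply poll until it sees a 200. A sketch under the same assumptions as above (default `localhost:8080`, cpp-httplib on the include path):

```cpp
// wait_ready.cpp - poll /health until the server answers 200, i.e. the model
// is loaded and at least one slot is idle. Assumes the default localhost:8080.
#include <chrono>
#include <cstdio>
#include <thread>

#include "httplib.h"

int main() {
    httplib::Client cli("http://localhost:8080");

    for (int attempt = 0; attempt < 60; attempt++) {
        auto res = cli.Get("/health?fail_on_no_slot=1");
        if (res && res->status == 200) {
            printf("server ready after %d attempt(s)\n", attempt + 1);
            return 0;
        }
        if (res && res->status == 500) {
            // {"status": "error"}: the model failed to load; retrying won't help
            fprintf(stderr, "model failed to load\n");
            return 1;
        }
        // 503 (loading model / no slot available) or not reachable yet: retry
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }
    fprintf(stderr, "timed out waiting for server\n");
    return 1;
}
```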