- `--grp-attn-n`: Set the group attention factor to extend the context size through self-extend (default: 1 = disabled); used together with the group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend the context size through self-extend (default: 512); used together with the group attention factor `--grp-attn-n`
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
+- `--slots-endpoint-disable`: Disables the slots state monitoring endpoint (see the launch example below). The slots state may contain user data, including prompts.
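+
+As an illustrative invocation (the model path, context size, and self-extend values here are examples, not defaults), the flags above can be combined when starting the server:
+
+```sh
+# enable self-extend (factor 4, width 2048) and turn off the /slots endpoint
+./server -m models/llama-2-7b-32k-instruct.Q2_K.gguf -c 8192 \
+    --grp-attn-n 4 --grp-attn-w 2048 --slots-endpoint-disable
+```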
## Build
}'
```
+- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+
+### Result JSON
+
+```json
+[
+ {
+ "dynatemp_exponent": 1.0,
+ "dynatemp_range": 0.0,
+ "frequency_penalty": 0.0,
+ "grammar": "",
+ "id": 0,
+ "ignore_eos": false,
+ "logit_bias": [],
+ "min_p": 0.05000000074505806,
+ "mirostat": 0,
+ "mirostat_eta": 0.10000000149011612,
+ "mirostat_tau": 5.0,
+ "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
+ "n_ctx": 2048,
+ "n_keep": 0,
+ "n_predict": 100000,
+ "n_probs": 0,
+ "next_token": {
+ "has_next_token": true,
+ "n_remain": -1,
+ "num_tokens_predicted": 0,
+ "stopped_eos": false,
+ "stopped_limit": false,
+ "stopped_word": false,
+ "stopping_word": ""
+ },
+ "penalize_nl": true,
+ "penalty_prompt_tokens": [],
+ "presence_penalty": 0.0,
+ "prompt": "Say hello to llama.cpp",
+ "repeat_last_n": 64,
+ "repeat_penalty": 1.100000023841858,
+ "samplers": [
+ "top_k",
+ "tfs_z",
+ "typical_p",
+ "top_p",
+ "min_p",
+ "temperature"
+ ],
+ "seed": 42,
+ "state": 1,
+ "stop": [
+ "\n"
+ ],
+ "stream": false,
+ "task_id": 0,
+ "temperature": 0.0,
+ "tfs_z": 1.0,
+ "top_k": 40,
+ "top_p": 0.949999988079071,
+ "typical_p": 1.0,
+ "use_penalty_prompt_tokens": false
+ }
+]
+```
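+
+While the server is running, the slots state can be fetched with a plain GET request (host and port below assume the server defaults):
+
+```sh
+curl http://localhost:8080/slots
+```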
+
## More examples
### Change system prompt on runtime
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
+ bool slots_endpoint = true;
};
bool server_verbose = false;
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
+ printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
}
+ else if (arg == "--slots-endpoint-disable")
+ {
+ sparams.slots_endpoint = false;
+ }
else if (arg == "--chat-template")
{
if (++i >= argc)
}
});
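+    // Expose the per-slot processing state as JSON; skipped when --slots-endpoint-disable is set.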
+ if (sparams.slots_endpoint) {
+ svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+ json slots;
+ for (llama_client_slot & slot : llama.slots) {
+ json slot_data = llama.get_formated_generation(slot);
+ slot_data["id"] = slot.id;
+ slot_data["task_id"] = slot.task_id;
+ slot_data["state"] = slot.state;
+ slot_data["prompt"] = slot.prompt;
+ slot_data["next_token"] = {
+ {"has_next_token", slot.has_next_token},
+ {"n_remain", slot.n_remaining},
+ {"num_tokens_predicted", slot.n_decoded},
+ {"stopped_eos", slot.stopped_eos},
+ {"stopped_word", slot.stopped_word},
+ {"stopped_limit", slot.stopped_limit},
+ {"stopping_word", slot.stopping_word},
+ };
+
+ slots.push_back(slot_data);
+ }
+ res.set_content(slots.dump(), "application/json");
+ res.status = 200; // HTTP OK
+ });
+ }
+
svr.set_logger(log_server_request);
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)