server : various fixes (#10704)
author     Georgi Gerganov <redacted>
           Sat, 7 Dec 2024 16:02:05 +0000 (18:02 +0200)
committer  GitHub <redacted>
           Sat, 7 Dec 2024 16:02:05 +0000 (18:02 +0200)
* server : various fixes

ggml-ci

* server : show current seed in slot_params

ggml-ci

* fix /slots endpoint

* Update examples/server/server.cpp

Co-authored-by: Georgi Gerganov <redacted>
* server : reflect endpoint response changes in the readme

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <redacted>
Co-authored-by: Xuan Son Nguyen <redacted>
examples/server/CMakeLists.txt
examples/server/README.md
examples/server/server.cpp
examples/server/utils.hpp

diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 0035859a6732edd5a991500aecee296fc19f696d..63fca1d590f5270e4e86fabb8e32a4da6ffc8752 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -34,14 +34,6 @@ endforeach()
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 
-# clean up generated files in pre-build step
-foreach(asset ${PUBLIC_ASSETS})
-    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
-    add_custom_command(TARGET ${TARGET} PRE_BUILD
-        COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
-    )
-endforeach()
-
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 
 if (LLAMA_SERVER_SSL)
diff --git a/examples/server/README.md b/examples/server/README.md
index 8dbed2626a4444b09639e8e165910ecd4d5655a2..0bab40a82250caf199d6d81dc545e97b2ba7b318 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -618,9 +618,76 @@ This endpoint is public (no API key check). By default, it is read-only. To make
 
 ```json
 {
-  "default_generation_settings": { ... },
+  "default_generation_settings": {
+    "id": 0,
+    "id_task": -1,
+    "n_ctx": 1024,
+    "speculative": false,
+    "is_processing": false,
+    "params": {
+      "n_predict": -1,
+      "seed": 4294967295,
+      "temperature": 0.800000011920929,
+      "dynatemp_range": 0.0,
+      "dynatemp_exponent": 1.0,
+      "top_k": 40,
+      "top_p": 0.949999988079071,
+      "min_p": 0.05000000074505806,
+      "xtc_probability": 0.0,
+      "xtc_threshold": 0.10000000149011612,
+      "typical_p": 1.0,
+      "repeat_last_n": 64,
+      "repeat_penalty": 1.0,
+      "presence_penalty": 0.0,
+      "frequency_penalty": 0.0,
+      "dry_multiplier": 0.0,
+      "dry_base": 1.75,
+      "dry_allowed_length": 2,
+      "dry_penalty_last_n": -1,
+      "dry_sequence_breakers": [
+        "\n",
+        ":",
+        "\"",
+        "*"
+      ],
+      "mirostat": 0,
+      "mirostat_tau": 5.0,
+      "mirostat_eta": 0.10000000149011612,
+      "penalize_nl": false,
+      "stop": [],
+      "max_tokens": -1,
+      "n_keep": 0,
+      "n_discard": 0,
+      "ignore_eos": false,
+      "stream": true,
+      "n_probs": 0,
+      "min_keep": 0,
+      "grammar": "",
+      "samplers": [
+        "dry",
+        "top_k",
+        "typ_p",
+        "top_p",
+        "min_p",
+        "xtc",
+        "temperature"
+      ],
+      "speculative.n_max": 16,
+      "speculative.n_min": 5,
+      "speculative.p_min": 0.8999999761581421,
+      "timings_per_token": false
+    },
+    "prompt": "",
+    "next_token": {
+      "has_next_token": true,
+      "has_new_line": false,
+      "n_remain": -1,
+      "n_decoded": 0,
+      "stopping_word": ""
+    }
+  },
   "total_slots": 1,
-  "chat_template": ""
+  "chat_template": "..."
 }
 ```
 
@@ -739,56 +806,74 @@ Example:
 
 ```json
 [
-    {
-        "dynatemp_exponent": 1.0,
-        "dynatemp_range": 0.0,
-        "frequency_penalty": 0.0,
-        "grammar": "",
-        "id": 0,
-        "ignore_eos": false,
-        "is_processing": false,
-        "logit_bias": [],
-        "min_p": 0.05000000074505806,
-        "mirostat": 0,
-        "mirostat_eta": 0.10000000149011612,
-        "mirostat_tau": 5.0,
-        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
-        "n_ctx": 2048,
-        "n_keep": 0,
-        "n_predict": 100000,
-        "n_probs": 0,
-        "next_token": {
-            "has_next_token": true,
-            "n_remain": -1,
-            "n_decoded": 0,
-            "stopped_eos": false,
-            "stopped_limit": false,
-            "stopped_word": false,
-            "stopping_word": ""
-        },
-        "penalize_nl": true,
-        "presence_penalty": 0.0,
-        "prompt": "Say hello to llama.cpp",
-        "repeat_last_n": 64,
-        "repeat_penalty": 1.100000023841858,
-        "samplers": [
-            "top_k",
-            "typical_p",
-            "top_p",
-            "min_p",
-            "temperature"
-        ],
-        "seed": 42,
-        "stop": [
-            "\n"
-        ],
-        "stream": false,
-        "task_id": 0,
-        "temperature": 0.0,
-        "top_k": 40,
-        "top_p": 0.949999988079071,
-        "typical_p": 1.0
+  {
+    "id": 0,
+    "id_task": -1,
+    "n_ctx": 1024,
+    "speculative": false,
+    "is_processing": false,
+    "params": {
+      "n_predict": -1,
+      "seed": 4294967295,
+      "temperature": 0.800000011920929,
+      "dynatemp_range": 0.0,
+      "dynatemp_exponent": 1.0,
+      "top_k": 40,
+      "top_p": 0.949999988079071,
+      "min_p": 0.05000000074505806,
+      "xtc_probability": 0.0,
+      "xtc_threshold": 0.10000000149011612,
+      "typical_p": 1.0,
+      "repeat_last_n": 64,
+      "repeat_penalty": 1.0,
+      "presence_penalty": 0.0,
+      "frequency_penalty": 0.0,
+      "dry_multiplier": 0.0,
+      "dry_base": 1.75,
+      "dry_allowed_length": 2,
+      "dry_penalty_last_n": -1,
+      "dry_sequence_breakers": [
+        "\n",
+        ":",
+        "\"",
+        "*"
+      ],
+      "mirostat": 0,
+      "mirostat_tau": 5.0,
+      "mirostat_eta": 0.10000000149011612,
+      "penalize_nl": false,
+      "stop": [],
+      "max_tokens": -1,
+      "n_keep": 0,
+      "n_discard": 0,
+      "ignore_eos": false,
+      "stream": true,
+      "n_probs": 0,
+      "min_keep": 0,
+      "grammar": "",
+      "samplers": [
+        "dry",
+        "top_k",
+        "typ_p",
+        "top_p",
+        "min_p",
+        "xtc",
+        "temperature"
+      ],
+      "speculative.n_max": 16,
+      "speculative.n_min": 5,
+      "speculative.p_min": 0.8999999761581421,
+      "timings_per_token": false
+    },
+    "prompt": "",
+    "next_token": {
+      "has_next_token": true,
+      "has_new_line": false,
+      "n_remain": -1,
+      "n_decoded": 0,
+      "stopping_word": ""
     }
+  }
 ]
 ```
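
For reference, here is a minimal client sketch (not part of this commit) that reads the updated `/props` and `/slots` responses shown above. It assumes a server running on `localhost:8080` with the `/slots` endpoint enabled, and uses cpp-httplib and nlohmann::json, the same libraries the server example builds against.

```cpp
// Minimal sketch of a client for the updated /props and /slots responses.
// Assumptions: a llama-server instance on localhost:8080 and an enabled
// /slots endpoint; shown only to illustrate the new field layout.
#include <cstdio>

#include "httplib.h"            // cpp-httplib (vendored by the server example)
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080);

    // /props: per-slot defaults now live under default_generation_settings.params
    if (auto res = cli.Get("/props"); res && res->status == 200) {
        const json props    = json::parse(res->body);
        const json & params = props["default_generation_settings"]["params"];
        printf("default seed        = %s\n", params["seed"].dump().c_str());
        printf("default temperature = %s\n", params["temperature"].dump().c_str());
    }

    // /slots: an array of slots, each with the same "params" shape plus state
    if (auto res = cli.Get("/slots"); res && res->status == 200) {
        for (const json & slot : json::parse(res->body)) {
            printf("slot %d: is_processing=%s, n_ctx=%d\n",
                   slot["id"].get<int>(),
                   slot["is_processing"].get<bool>() ? "true" : "false",
                   slot["n_ctx"].get<int>());
        }
    }

    return 0;
}
```

Both responses share the same `params` object produced by `slot_params::to_json()` (see the server.cpp diff below), which is why the two blocks above can be parsed identically.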
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d57a296a2de07e9a42e98189ebff428ffe336028..1ce8fbae23d5c8732f2e03ca70c3cd93d8bf6bd1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -122,11 +122,6 @@ struct slot_params {
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
 
-    // params only used in to_json()
-    int32_t n_ctx;
-    uint32_t seed_cur;
-    bool can_speculative;
-
     // OAI-compat fields
     bool        verbose        = false;
     bool        oaicompat      = false;
@@ -134,7 +129,7 @@ struct slot_params {
     std::string oaicompat_model;
     std::string oaicompat_cmpl_id;
 
-    json to_json() {
+    json to_json() const {
         std::vector<std::string> samplers;
         samplers.reserve(sampling.samplers.size());
         for (const auto & sampler : sampling.samplers) {
@@ -142,8 +137,8 @@ struct slot_params {
         }
 
         return json {
-            {"n_ctx",                     n_ctx},
             {"n_predict",                 n_predict},     // Server configured n_predict
+            {"seed",                      sampling.seed},
             {"temperature",               sampling.temp},
             {"dynatemp_range",            sampling.dynatemp_range},
             {"dynatemp_exponent",         sampling.dynatemp_exponent},
@@ -177,7 +172,6 @@ struct slot_params {
             {"min_keep",                  sampling.min_keep},
             {"grammar",                   sampling.grammar},
             {"samplers",                  samplers},
-            {"speculative",               can_speculative},
             {"speculative.n_max",         speculative.n_max},
             {"speculative.n_min",         speculative.n_min},
             {"speculative.p_min",         speculative.p_min},
@@ -483,12 +477,6 @@ struct server_task_result_cmpl_partial : server_task_result {
                     return std::vector<json>({initial_ret, second_ret});
                 }
             } else {
-                // Some idiosyncrasy in task processing logic makes several trailing calls
-                // with empty content, we ignore these at the calee site.
-                if (content.empty()) {
-                    return std::vector<json>({json::object()});
-                }
-
                 choices = json::array({json{
                     {"finish_reason", nullptr},
                     {"index", 0},
@@ -722,6 +710,7 @@ struct server_slot {
 
     llama_batch batch_spec = {};
 
+    llama_context * ctx = nullptr;
     llama_context * ctx_dft = nullptr;
 
     common_speculative * spec = nullptr;
@@ -906,6 +895,27 @@ struct server_slot {
                 t_token_generation, n_decoded, t_gen, n_gen_second,
                 t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
     }
+
+    json to_json() const {
+        return json {
+            {"id",            id},
+            {"id_task",       id_task},
+            {"n_ctx",         n_ctx},
+            {"speculative",   can_speculate()},
+            {"is_processing", is_processing()},
+            {"params",        params.to_json()},
+            {"prompt",        common_detokenize(ctx, prompt_tokens)},
+            {"next_token",
+                {
+                    {"has_next_token", has_next_token},
+                    {"has_new_line",   has_new_line},
+                    {"n_remain",       n_remaining},
+                    {"n_decoded",      n_decoded},
+                    {"stopping_word",  stopping_word},
+                }
+            },
+        };
+    }
 };
 
 struct server_metrics {
@@ -1338,6 +1348,7 @@ struct server_context {
             server_slot slot;
 
             slot.id = i;
+            slot.ctx = ctx;
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params_base.n_predict;
 
@@ -1370,8 +1381,7 @@ struct server_context {
             slots.push_back(slot);
         }
 
-        default_generation_settings_for_props = slots[0].params.to_json();
-        default_generation_settings_for_props["seed"] = -1;
+        default_generation_settings_for_props = slots[0].to_json();
 
         // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
@@ -1848,17 +1858,18 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
-    void send_partial_response(server_slot & slot, completion_token_output tkn) {
+    void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
-        res->id              = slot.id_task;
-        res->index           = slot.index;
-        res->content         = tkn.text_to_send;
+
+        res->id      = slot.id_task;
+        res->index   = slot.index;
+        res->content = tkn.text_to_send;
 
         res->truncated       = slot.truncated;
         res->n_decoded       = slot.n_decoded;
         res->n_prompt_tokens = slot.n_prompt_tokens;
 
-        res->stop            = slot.stop;
+        res->stop = slot.stop;
 
         res->verbose           = slot.params.verbose;
         res->oaicompat         = slot.params.oaicompat;
@@ -1869,6 +1880,7 @@ struct server_context {
         // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+
             const size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
 
@@ -1891,7 +1903,8 @@ struct server_context {
     void send_final_response(server_slot & slot) {
         if (slot.params.stream) {
             // if in stream mode, send the last partial response
-            return send_partial_response(slot, {0, "", {}});
+            send_partial_response(slot, {0, "", {}});
+            return;
         }
 
         auto res = std::make_unique<server_task_result_cmpl_final>();
@@ -2012,6 +2025,7 @@ struct server_context {
         std::vector<server_task> tasks;
         auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) {
             SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size());
+
             server_task task;
             task.id            = queue_tasks.get_new_id();
             task.inf_type      = inf_type;
@@ -2205,18 +2219,7 @@ struct server_context {
                     int n_processing_slots = 0;
 
                     for (server_slot & slot : slots) {
-                        json slot_data = slot.params.to_json();
-                        slot_data["id"]            = slot.id;
-                        slot_data["id_task"]       = slot.id_task;
-                        slot_data["is_processing"] = slot.is_processing();
-                        slot_data["prompt"]        = common_detokenize(ctx, slot.prompt_tokens);
-                        slot_data["next_token"]    = {
-                            {"has_next_token", slot.has_next_token},
-                            {"has_new_line",   slot.has_new_line},
-                            {"n_remain",       slot.n_remaining},
-                            {"n_decoded",      slot.n_decoded},
-                            {"stopping_word",  slot.stopping_word},
-                        };
+                        json slot_data = slot.to_json();
 
                         if (slot.is_processing()) {
                             n_processing_slots++;
@@ -2230,6 +2233,7 @@ struct server_context {
 
                     auto res = std::make_unique<server_task_result_metrics>();
                     res->id                  = task.id;
+                    res->slots_data          = std::move(slots_data);
                     res->n_idle_slots        = n_idle_slots;
                     res->n_processing_slots  = n_processing_slots;
                     res->n_tasks_deferred    = queue_tasks.queue_tasks_deferred.size();
@@ -3003,11 +3007,11 @@ int main(int argc, char ** argv) {
         res.status = 200;
     };
 
-    svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
+    svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
         std::string message;
         try {
             std::rethrow_exception(ep);
-        } catch (std::exception & e) {
+        } catch (const std::exception & e) {
             message = e.what();
         } catch (...) {
             message = "Unknown Exception";
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index a96116ac36caaf26651abac34f2919721bdf01c5..c9fe7d966b2095e320e2d864aa6659a907a30c96 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -327,12 +327,12 @@ static std::string llama_get_chat_template(const struct llama_model * model) {
     std::string template_key = "tokenizer.chat_template";
     // call with NULL buffer to get the total size of the string
     int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
-    if (res < 0) {
+    if (res < 2) {
         return "";
     } else {
         std::vector<char> model_template(res, 0);
         llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
     }
 }
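
For context, the same query-then-fill pattern can be applied to any GGUF metadata key. The sketch below (not part of this commit, and the helper name is illustrative) mirrors the fixed `llama_get_chat_template()`, including the patch's assumption that the length reported by `llama_model_meta_val_str()` counts the terminating NUL byte, hence the `< 2` check and the `size() - 1` when constructing the string.

```cpp
// Sketch: read an arbitrary metadata value using the same two-call pattern
// as the fixed llama_get_chat_template() above. The NUL-terminator handling
// follows the patched code.
#include <cstdint>
#include <string>
#include <vector>

#include "llama.h"

static std::string model_meta_str(const struct llama_model * model, const std::string & key) {
    // first call with a NULL buffer only queries the required size
    const int32_t res = llama_model_meta_val_str(model, key.c_str(), NULL, 0);
    if (res < 2) {
        return ""; // key missing or value empty
    }

    std::vector<char> buf(res, 0);
    llama_model_meta_val_str(model, key.c_str(), buf.data(), buf.size());

    return std::string(buf.data(), buf.size() - 1); // drop the trailing '\0'
}
```

Calling `model_meta_str(model, "tokenizer.chat_template")` reproduces the behavior of the patched function.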