server : add system_fingerprint to chat/completion (#10917)
author     Xuan Son Nguyen <redacted>
           Mon, 23 Dec 2024 11:02:44 +0000 (12:02 +0100)
committer  GitHub <redacted>
           Mon, 23 Dec 2024 11:02:44 +0000 (12:02 +0100)
* server : add system_fingerprint to chat/completion

* update README

examples/server/README.md
examples/server/server.cpp
examples/server/tests/unit/test_chat_completion.py
examples/server/utils.hpp

diff --git a/examples/server/README.md b/examples/server/README.md
index 6d64656926250872ec62b3f15b045190d735713a..5e3d6a6e643a6eb995255c757a8e562d9028faad 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -724,7 +724,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
   },
   "total_slots": 1,
   "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-  "chat_template": "..."
+  "chat_template": "...",
+  "build_info": "b(build number)-(build commit hash)"
 }
 ```
 
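With this change, the `/props` response also exposes the server build as `build_info`. A minimal client-side sketch (Python, standard library only; the host, port and the example fingerprint value are assumptions, not part of this commit):

```python
# Sketch: fetch /props from a locally running llama-server and print the
# newly added build_info field (host/port are assumptions).
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8080/props") as resp:
    props = json.load(resp)

print(props["build_info"])  # e.g. "b1234-abcdef0": "b" + build number + "-" + commit hash
```
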
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fa3682a920649ce980503aa0c3da6aefac6e489d..c571ed3c104d4ed3d477bfd8b6342894cbeddfaf 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -595,10 +595,11 @@ struct server_task_result_cmpl_final : server_task_result {
         std::time_t t = std::time(0);
 
         json res = json {
-            {"choices", json::array({choice})},
-            {"created", t},
-            {"model", oaicompat_model},
-            {"object", "chat.completion"},
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens",     n_prompt_tokens},
@@ -632,11 +633,12 @@ struct server_task_result_cmpl_final : server_task_result {
         };
 
         json ret = json {
-            {"choices", json::array({choice})},
-            {"created", t},
-            {"id",      oaicompat_cmpl_id},
-            {"model",   oaicompat_model},
-            {"object",  "chat.completion.chunk"},
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens",     n_prompt_tokens},
@@ -761,11 +763,12 @@ struct server_task_result_cmpl_partial : server_task_result {
         }
 
         json ret = json {
-            {"choices", choices},
-            {"created", t},
-            {"id",      oaicompat_cmpl_id},
-            {"model",   oaicompat_model},
-            {"object",  "chat.completion.chunk"}
+            {"choices",            choices},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"}
         };
 
         if (timings.prompt_n >= 0) {
@@ -3476,6 +3479,7 @@ int main(int argc, char ** argv) {
             { "total_slots",                 ctx_server.params_base.n_parallel },
             { "model_path",                  ctx_server.params_base.model },
             { "chat_template",               llama_get_chat_template(ctx_server.model) },
+            { "build_info",                  build_info },
         };
 
         res_ok(res, data);
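
The same `build_info` string is what the chat completion responses now return as `system_fingerprint`. A rough client sketch for the non-streaming case (Python standard library; host, port, prompt and the example fingerprint value are assumptions):

```python
# Sketch: call the OpenAI-compatible chat endpoint of a local llama-server
# and read the system_fingerprint added by this commit (host/port assumed).
import json
import urllib.request

payload = {
    "messages": [{"role": "user", "content": "Say hello"}],
    "max_tokens": 8,
}
req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

print(body["system_fingerprint"])                # e.g. "b1234-abcdef0"
print(body["choices"][0]["message"]["content"])
```
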
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index 0fa1a17c1f50a1ed48b0903662abc0ef672ffd31..88549708113e9a1352ebaa5d47b25fc1a19f8132 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     })
     assert res.status_code == 200
     assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
+    assert res.body["system_fingerprint"].startswith("b")
     assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     last_cmpl_id = None
     for data in res:
         choice = data["choices"][0]
+        assert data["system_fingerprint"].startswith("b")
         assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
@@ -92,6 +94,7 @@ def test_chat_completion_with_openai_library():
         seed=42,
         temperature=0.8,
     )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
     assert res.choices[0].finish_reason == "length"
     assert res.choices[0].message.content is not None
     assert match_regex("(Suddenly)+", res.choices[0].message.content)
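
For the streaming path, each `chat.completion.chunk` now carries the fingerprint as well, which the `openai` Python client surfaces as `chunk.system_fingerprint`. A sketch along the lines of the tests above (base_url, api_key and prompt are assumptions):

```python
# Sketch: stream a chat completion from a local llama-server via the openai
# client and check the per-chunk system_fingerprint (connection details assumed).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="dummy")
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",  # single-model server: the name is echoed back, not used for selection
    messages=[{"role": "user", "content": "Write a short greeting"}],
    max_tokens=16,
    stream=True,
)
for chunk in stream:
    assert chunk.system_fingerprint is not None and chunk.system_fingerprint.startswith("b")
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```
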
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 94bb285b6f2d14847f4b36847223e6e712a2d14e..1987acac89159e461872489ab52e1b5df61f0618 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -56,6 +56,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     }
 }
 
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
 //
 // tokenizer and input processing utils
 //
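
Since `build_info` is simply `"b" + LLAMA_BUILD_NUMBER + "-" + LLAMA_COMMIT`, the fingerprint a client receives can be split back into its two parts. A hypothetical helper, with a made-up example value:

```python
# Sketch: split a fingerprint such as "b1234-abcdef0" (the format produced by
# the build_info string above) into build number and commit hash.
def parse_fingerprint(fingerprint: str) -> tuple[int, str]:
    build, _, commit = fingerprint.partition("-")
    return int(build.lstrip("b")), commit

print(parse_fingerprint("b1234-abcdef0"))  # -> (1234, 'abcdef0')
```
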