server : add /apply-template endpoint for additional use cases of Minja functionality...

author Nigel Bosch <redacted>

Wed, 29 Jan 2025 18:45:44 +0000 (12:45 -0600)

committer GitHub <redacted>

Wed, 29 Jan 2025 18:45:44 +0000 (19:45 +0100)
author Nigel Bosch <redacted>
Wed, 29 Jan 2025 18:45:44 +0000 (12:45 -0600)
committer GitHub <redacted>
Wed, 29 Jan 2025 18:45:44 +0000 (19:45 +0100)
diff --git a/examples/server/README.md b/examples/server/README.md

index e788d8b59bc56c840c168d068d477e942518e17e..cedae0b6dfb471228fcbca209f950afae29bcfe1 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -576,6 +576,14 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
  
  `tokens`: Set the tokens to detokenize.
  
+### POST `/apply-template`: Apply chat template to a conversation
+
+Uses the server's prompt template formatting functionality to convert chat messages into the single string that a chat model expects as input, without performing inference. The resulting prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before it is sent to `/completion` to generate the chat response.
+
+*Options:*
+
+`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
+
  ### POST `/embedding`: Generate embedding of a given text
  
  > [!IMPORTANT]
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index c5efbdb09ac404ad1659e6f7560bfd48745c7682..6e28d283a46e652a33ade6014395721ef42f2724 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -4124,6 +4124,14 @@ int main(int argc, char ** argv) {
          res_ok(res, root);
      };
  
+    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        auto body = json::parse(req.body);
+        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
+
+        res_ok(res, {{ "prompt", data.at("prompt") }});
+    };
+
      const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
          handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
      };
@@ -4300,6 +4308,7 @@ int main(int argc, char ** argv) {
      svr->Post("/v1/reranking",        handle_rerank);
      svr->Post("/tokenize",            handle_tokenize);
      svr->Post("/detokenize",          handle_detokenize);
+    svr->Post("/apply-template",      handle_apply_template);
      // LoRA adapters hotswap
      svr->Get ("/lora-adapters",       handle_lora_adapters_list);
      svr->Post("/lora-adapters",       handle_lora_adapters_apply);
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py

index 2e15348dceecbbcb31538c71830c1a77b020f469..add3f810f5e9909671bee535efc791ab2d38bc5e 100644 (file)
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -121,6 +121,21 @@ def test_chat_template():
      assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
  
  
+def test_apply_chat_template():
+    global server
+    server.chat_template = "command-r"
+    server.start()
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "system", "content": "You are a test."},
+            {"role": "user", "content":"Hi there"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "prompt" in res.body
+    assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+
+
  @pytest.mark.parametrize("response_format,n_predicted,re_content", [
      ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
      ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
author	Nigel Bosch <redacted>
	Wed, 29 Jan 2025 18:45:44 +0000 (12:45 -0600)
committer	GitHub <redacted>
	Wed, 29 Jan 2025 18:45:44 +0000 (19:45 +0100)
examples/server/README.md		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history
examples/server/tests/unit/test_chat_completion.py		patch \| blob \| history