server : added --no-prefill-assistant flag (#13608)

author Isaac McFadyen <redacted>

Sat, 17 May 2025 21:59:48 +0000 (17:59 -0400)

committer GitHub <redacted>

Sat, 17 May 2025 21:59:48 +0000 (23:59 +0200)
author Isaac McFadyen <redacted>
Sat, 17 May 2025 21:59:48 +0000 (17:59 -0400)
committer GitHub <redacted>
Sat, 17 May 2025 21:59:48 +0000 (23:59 +0200)
diff --git a/common/arg.cpp b/common/arg.cpp

index 8aa72515d1042f9330093f965fd284c08ef03132..305168043c27c2ce68776c292fe8a065dce34c36 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2880,6 +2880,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              params.chat_template = read_file(value);
          }
      ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
      add_opt(common_arg(
          {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
          string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
diff --git a/common/common.h b/common/common.h

index a99a36029a53c2e0cad86be38421076118443fce..da525dd420b7d0a756a3e6b25d506faf7dee6e5f 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -368,6 +368,7 @@ struct common_params {
      bool use_jinja = false;                                                                                 // NOLINT
      bool enable_chat_template = true;
      common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response
  
      std::vector<std::string> api_keys;
  
diff --git a/tools/server/README.md b/tools/server/README.md

index 17ad93df61f87fa125a9eade12df66e7ab53e892..0b84966ae86d7467f9acdaaa676d3564d4d2203a 100644 (file)
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -13,6 +13,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
   * Multimodal ([documentation](../../docs/multimodal.md)) / with OpenAI-compatible API support
   * Monitoring endpoints
   * Schema-constrained JSON response format
+ * Prefilling of assistant messages similar to the Claude API
   * [Function calling](../../docs/function-calling.md) / tool use for ~any model
   * Speculative decoding
   * Easy-to-use web UI
@@ -175,6 +176,7 @@ The project is under active development, and we are [looking for feedback and co
  | `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
  | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
  | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
  | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
  | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
  | `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
diff --git a/tools/server/server.cpp b/tools/server/server.cpp

index 129d013ac75f7861f7babe50977f1c7ecea96041..348588a2cb22403f90ed772f2f04debdfc5c4f0a 100644 (file)
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4348,6 +4348,7 @@ int main(int argc, char ** argv) {
          json data = oaicompat_completion_params_parse(
              body,
              params.use_jinja,
+            params.prefill_assistant,
              params.reasoning_format,
              ctx_server.chat_templates.get(),
              ctx_server.mctx,
@@ -4369,6 +4370,7 @@ int main(int argc, char ** argv) {
          json data = oaicompat_completion_params_parse(
              body,
              params.use_jinja,
+            params.prefill_assistant,
              params.reasoning_format,
              ctx_server.chat_templates.get(),
              ctx_server.mctx,
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp

index 232eef195437f4f9d4968e48e2565c632c63fbd6..3e7733539fe0ece714757f2b6468db2b9eb45c8f 100644 (file)
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -583,6 +583,7 @@ static json oaicompat_completion_params_parse(const json & body) {
  static json oaicompat_completion_params_parse(
      const json & body, /* openai api json semantics */
      bool use_jinja,
+    bool prefill_assistant,
      common_reasoning_format reasoning_format,
      const struct common_chat_templates * tmpls,
      bool allow_non_text,
@@ -732,7 +733,7 @@ static json oaicompat_completion_params_parse(
  
      // if the assistant message appears at the end of list, we do not add end-of-turn token
      // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
      common_chat_msg last_message;
      if (prefill_assistant_message) {
          last_message = inputs.messages.back();
author	Isaac McFadyen <redacted>
	Sat, 17 May 2025 21:59:48 +0000 (17:59 -0400)
committer	GitHub <redacted>
	Sat, 17 May 2025 21:59:48 +0000 (23:59 +0200)
common/arg.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
tools/server/README.md		patch \| blob \| history
tools/server/server.cpp		patch \| blob \| history
tools/server/utils.hpp		patch \| blob \| history