Thinking model disabled assistant prefill (#15404)

author Gabe Goodhart <redacted>

Fri, 5 Sep 2025 20:31:24 +0000 (14:31 -0600)

committer GitHub <redacted>

Fri, 5 Sep 2025 20:31:24 +0000 (14:31 -0600)
author Gabe Goodhart <redacted>
Fri, 5 Sep 2025 20:31:24 +0000 (14:31 -0600)
committer GitHub <redacted>
Fri, 5 Sep 2025 20:31:24 +0000 (14:31 -0600)
diff --git a/common/chat.cpp b/common/chat.cpp

index e962adfdc2bfc32513a6347847adcb2fff958cea..a8a4c3e3c12b67e05cfbf7d8cf7c57a3e42e908e 100644 (file)
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
      throw std::runtime_error("Invalid tool_choice: " + tool_choice);
  }
  
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) { // probe: true when toggling enable_thinking changes the rendered prompt
+    common_chat_templates_inputs dummy_inputs; // throwaway inputs used only for the two probe renders below
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test"; // placeholder text; only the difference between the two renders is examined
+    dummy_inputs.messages = {msg}; // a single user message is the whole probe conversation
+    dummy_inputs.enable_thinking = false; // first render: thinking turned off
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true; // second render: same inputs, thinking turned on
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt; // identical prompts => the template ignores the flag
+}
+
  template <>
  std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
      std::vector<common_chat_msg> msgs;
diff --git a/common/chat.h b/common/chat.h

index ccd26f27fca3e565a388320cc3df038d7dac5da6..41851022d9484f9c02d4a6f85a08e4eca59c8fd6 100644 (file)
--- a/common/chat.h
+++ b/common/chat.h
@@ -199,6 +199,8 @@ common_chat_msg           common_chat_parse(const std::string & input, bool is_p
  
  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
  
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
  // Parses a JSON array of messages in OpenAI's chat completion API format.
  // T can be std::string containing JSON or nlohmann::ordered_json
  template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp

index 44487eca593373e06a044737b060391d73cde30a..de7c9931177cfdc8b7964bdbcb8edfbd5ecf299c 100644 (file)
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2267,6 +2267,12 @@ struct server_context {
  
          metrics.init();
  
+        // thinking is enabled if:
+        // 1. It's not explicitly disabled (i.e. reasoning_budget != 0; a budget of 0 disables thinking)
+        // 2. The chat template supports it
+        const bool enable_thinking = params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+        SRV_INF("Enable thinking? %d\n", enable_thinking);
+
          oai_parser_opt = {
              /* use_jinja             */ params_base.use_jinja,
              /* prefill_assistant     */ params_base.prefill_assistant,
@@ -2275,7 +2281,7 @@ struct server_context {
              /* common_chat_templates */ chat_templates.get(),
              /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
              /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
-            /* enable_thinking       */ params_base.reasoning_budget != 0,
+            /* enable_thinking       */ enable_thinking,
          };
      }
  
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp

index 036060bb3e9c7ada3a1d7fb75c553507d0860ecb..8636cf511d636e469fdc279e343ef9df11347b6c 100644 (file)
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -54,8 +54,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
      if (body.contains(key) && !body.at(key).is_null()) {
          try {
              return body.at(key);
-        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
              return default_value;
          }
      } else {
@@ -708,6 +708,16 @@ static json oaicompat_chat_params_parse(
          inputs.chat_template_kwargs[item.key()] = item.value().dump();
      }
  
+    // parse the "enable_thinking" kwarg to override the default value
+    auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
+    if (enable_thinking_kwarg == "true") {
+        inputs.enable_thinking = true;
+    } else if (enable_thinking_kwarg == "false") {
+        inputs.enable_thinking = false;
+    } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
+        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
+    }
+
      // if the assistant message appears at the end of list, we do not add end-of-turn token
      // for ex. this can be useful to modify the reasoning process in reasoning models
      bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
@@ -724,7 +734,7 @@ static json oaicompat_chat_params_parse(
          /* TODO: test this properly */
          inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
  
-        if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+        if ( inputs.enable_thinking ) {
              throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
          }
author	Gabe Goodhart <redacted>
	Fri, 5 Sep 2025 20:31:24 +0000 (14:31 -0600)
committer	GitHub <redacted>
	Fri, 5 Sep 2025 20:31:24 +0000 (14:31 -0600)
common/chat.cpp		patch \| blob \| history
common/chat.h		patch \| blob \| history
tools/server/server.cpp		patch \| blob \| history
tools/server/utils.hpp		patch \| blob \| history