throw std::runtime_error("Invalid tool_choice: " + tool_choice);
}
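+// Check whether the chat template honours the enable_thinking flag: render a
+// minimal conversation twice, toggling the flag, and compare the results. If the
+// two prompts differ, the template reacts to the flag and therefore supports it.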
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+ common_chat_templates_inputs dummy_inputs;
+ common_chat_msg msg;
+ msg.role = "user";
+ msg.content = "test";
+ dummy_inputs.messages = {msg};
+ dummy_inputs.enable_thinking = false;
+ const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+ dummy_inputs.enable_thinking = true;
+ const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+ return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
template <>
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
std::vector<common_chat_msg> msgs;
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
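+// Returns true if the chat template renders differently depending on enable_thinking,
+// i.e. the template actually supports toggling the flag.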
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
// Parses a JSON array of messages in OpenAI's chat completion API format.
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
metrics.init();
+ // thinking is enabled if:
+ // 1. it is not explicitly disabled (a reasoning_budget of 0 disables it), and
+ // 2. the chat template supports it
+ const bool enable_thinking = params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+ SRV_INF("Enable thinking? %d\n", enable_thinking);
+
oai_parser_opt = {
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* common_chat_templates */ chat_templates.get(),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
- /* enable_thinking */ params_base.reasoning_budget != 0,
+ /* enable_thinking */ enable_thinking,
};
}
if (body.contains(key) && !body.at(key).is_null()) {
try {
return body.at(key);
- } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
- LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+ } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
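+ // log why the conversion failed before falling back to the default value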
+ LOG_WRN("Wrong type supplied for parameter '%s' (%s). Expected '%s', using default value\n", key.c_str(), err.what(), json(default_value).type_name());
return default_value;
}
} else {
inputs.chat_template_kwargs[item.key()] = item.value().dump();
}
+ // parse the "enable_thinking" kwarg to override the default value
+ auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
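+ // kwargs are stored as dumped JSON, so a boolean arrives as "true"/"false"
+ // while a JSON string arrives with its surrounding quotes intact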
+ if (enable_thinking_kwarg == "true") {
+ inputs.enable_thinking = true;
+ } else if (enable_thinking_kwarg == "false") {
+ inputs.enable_thinking = false;
+ } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
+ throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
+ }
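+ // e.g. a request carrying {"chat_template_kwargs": {"enable_thinking": false}}
+ // disables thinking for that request, overriding the server-level default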
+
// if the assistant message appears at the end of the list, we do not add the end-of-turn token
// for example, this can be useful to modify the reasoning process in reasoning models
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
/* TODO: test this properly */
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
- if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
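+ // prefilling the assistant response cannot be combined with thinking output,
+ // so reject the combination explicitly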
+ if (inputs.enable_thinking) {
throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
}