}
auto msg = builder.result();
if (!is_partial) {
- LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
}
return msg;
}
mapper.from_ast(ctx.ast, result);
}
if (!is_partial) {
- LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
}
return msg;
}
#include "log.h"
#include "regex-partial.h"
-// #include <minja/chat-template.hpp>
-// #include <minja/minja.hpp>
-
#include "jinja/parser.h"
#include "jinja/value.h"
#include "jinja/runtime.h"
return !msg.content.empty() || !msg.tool_calls.empty();
}
-template <>
-json common_chat_msg::to_json_oaicompat() const
-{
- json message {
- {"role", "assistant"},
+json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
+ if (!content.empty() && !content_parts.empty()) {
+ throw std::runtime_error("Cannot specify both content and content_parts");
+ }
+ json jmsg {
+ {"role", role},
};
+ if (!content.empty()) {
+ jmsg["content"] = content;
+ } else if (!content_parts.empty()) {
+ if (concat_typed_text) {
+ std::string text;
+ for (const auto & part : content_parts) {
+ if (part.type != "text") {
+ LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
+ continue;
+ }
+ if (!text.empty()) {
+ text += '\n';
+ }
+ text += part.text;
+ }
+ jmsg["content"] = text;
+ } else {
+ auto & parts = jmsg["content"] = json::array();
+ for (const auto & part : content_parts) {
+ parts.push_back({
+ {"type", part.type},
+ {"text", part.text},
+ });
+ }
+ }
+ } else {
+ jmsg["content"] = "";
+ }
if (!reasoning_content.empty()) {
- message["reasoning_content"] = reasoning_content;
+ jmsg["reasoning_content"] = reasoning_content;
}
- if (content.empty() && !tool_calls.empty()) {
- message["content"] = json();
- } else {
- message["content"] = content;
+ if (!tool_name.empty()) {
+ jmsg["name"] = tool_name;
+ }
+ if (!tool_call_id.empty()) {
+ jmsg["tool_call_id"] = tool_call_id;
}
if (!tool_calls.empty()) {
- auto arr = json::array();
- for (const auto & tc : tool_calls) {
- arr.push_back({
+ jmsg["tool_calls"] = json::array();
+ auto & jtool_calls = jmsg["tool_calls"];
+ for (const auto & tool_call : tool_calls) {
+ json tc {
{"type", "function"},
{"function", {
- {"name", tc.name},
- {"arguments", tc.arguments},
+ {"name", tool_call.name},
+ {"arguments", tool_call.arguments},
}},
- {"id", tc.id},
- // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
- // // We only generate a random id for the ones that don't generate one by themselves
- // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
- // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
- });
+ };
+ if (!tool_call.id.empty()) {
+ tc["id"] = tool_call.id;
+ }
+ // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+ // We only generate a random id for the ones that don't generate one by themselves
+ // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+ // tc["id"] = tool_call.id.empty() ? gen_tool_call_id() : tool_call.id;
+ jtool_calls.push_back(tc);
}
- message["tool_calls"] = arr;
}
- return message;
+
+ return jmsg;
}
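Reviewer note: a minimal usage sketch of the refactored `common_chat_msg::to_json_oaicompat()` above. The message values are made up for illustration; the field names and the `concat_typed_text` behavior come from the hunk itself.

```cpp
// Assumes the common_chat_msg / common_chat_msg_content_part types from common/chat.h.
common_chat_msg msg;
msg.role = "user";
msg.content_parts = {
    { /* type */ "text", /* text */ "First paragraph"  },
    { /* type */ "text", /* text */ "Second paragraph" },
};

// Default (concat_typed_text = false): "content" is a typed array,
// e.g. [{"type": "text", "text": "First paragraph"}, ...]
auto j_typed = msg.to_json_oaicompat();

// concat_typed_text = true: text parts are joined with '\n' into a plain string,
// e.g. "First paragraph\nSecond paragraph"; non-text parts are skipped with LOG_WRN.
auto j_concat = msg.to_json_oaicompat(/* concat_typed_text= */ true);
```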
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) {
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
}
-template <>
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
std::vector<common_chat_msg> msgs;
return msgs;
}
-template <>
json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
json messages = json::array();
for (const auto & msg : msgs) {
- if (!msg.content.empty() && !msg.content_parts.empty()) {
- throw std::runtime_error("Cannot specify both content and content_parts");
- }
- json jmsg {
- {"role", msg.role},
- };
- if (!msg.content.empty()) {
- jmsg["content"] = msg.content;
- } else if (!msg.content_parts.empty()) {
- if (concat_typed_text) {
- std::string text;
- for (const auto & part : msg.content_parts) {
- if (part.type != "text") {
- LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
- continue;
- }
- if (!text.empty()) {
- text += '\n';
- }
- text += part.text;
- }
- jmsg["content"] = text;
- } else {
- auto & parts = jmsg["content"] = json::array();
- for (const auto & part : msg.content_parts) {
- parts.push_back({
- {"type", part.type},
- {"text", part.text},
- });
- }
- }
- } else {
- jmsg["content"] = "";
- }
- if (!msg.reasoning_content.empty()) {
- jmsg["reasoning_content"] = msg.reasoning_content;
- }
- if (!msg.tool_name.empty()) {
- jmsg["name"] = msg.tool_name;
- }
- if (!msg.tool_call_id.empty()) {
- jmsg["tool_call_id"] = msg.tool_call_id;
- }
- if (!msg.tool_calls.empty()) {
- auto & tool_calls = jmsg["tool_calls"] = json::array();
- for (const auto & tool_call : msg.tool_calls) {
- json tc {
- {"type", "function"},
- {"function", {
- {"name", tool_call.name},
- {"arguments", tool_call.arguments},
- }},
- };
- if (!tool_call.id.empty()) {
- tc["id"] = tool_call.id;
- }
- tool_calls.push_back(tc);
- }
- }
+ json jmsg = msg.to_json_oaicompat(concat_typed_text);
messages.push_back(jmsg);
}
return messages;
}
-template <>
-std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const std::string & messages) {
- return common_chat_msgs_parse_oaicompat(json::parse(messages));
-}
-
-template <>
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
std::vector<common_chat_tool> result;
return result;
}
-template <>
-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const std::string & tools) {
- return common_chat_tools_parse_oaicompat(json::parse(tools));
-}
-
-template <>
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
if (tools.empty()) {
return json();
return result;
}
-template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
+json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
const struct common_chat_templates_inputs & inputs)
{
templates_params params;
- params.tools = common_chat_tools_to_json_oaicompat<json>(inputs.tools);
+ params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
? *tmpls->template_tool_use
: *tmpls->template_default;
const auto & src = tmpl.source();
const auto & caps = tmpl.original_caps();
- params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
+ params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_typed_text= */ !tmpl.original_caps().requires_typed_content);
params.add_generation_prompt = inputs.add_generation_prompt;
params.tool_choice = inputs.tool_choice;
params.reasoning_format = inputs.reasoning_format;
src.find("<arg_value>") != std::string::npos &&
params.json_schema.is_null()) {
workaround::func_args_not_string(params.messages);
+ if (!params.extra_context.contains("clear_thinking")) {
+ // by default, do not clear reasoning_content (the clear_thinking flag was introduced with GLM-4.7)
+ params.extra_context["clear_thinking"] = false;
+ }
return common_chat_params_init_glm_4_5(tmpl, params);
}
? common_chat_templates_apply_jinja(tmpls, inputs)
: common_chat_templates_apply_legacy(tmpls, inputs);
}
+
+std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
+ GGML_ASSERT(chat_templates != nullptr);
+ GGML_ASSERT(chat_templates->template_default != nullptr);
+ return chat_templates->template_default->caps.to_map();
+}
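Reviewer note: the `std::map<std::string, bool>` returned here is consumed by the server meta code further down; nlohmann::json builds a JSON object from a string-keyed map directly, so no manual conversion is needed. A sketch (the `tmpls` handle is assumed, as in the PR's server changes):

```cpp
#include <map>
#include <string>
#include <nlohmann/json.hpp>

// Keys mirror caps::to_map(): requires_typed_content, supports_tools, ...
std::map<std::string, bool> caps = common_chat_templates_get_caps(tmpls.get());

// Implicit map -> JSON object conversion, as used for the /props payload.
nlohmann::ordered_json caps_json = caps;
```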
#include <vector>
#include <map>
+#include <nlohmann/json_fwd.hpp>
+
struct common_chat_templates;
struct common_chat_tool_call {
std::string type;
std::string text;
+ // TODO @ngxson : no known chat templates support reasoning_content in content parts yet
+ // this can be useful for models with interleaved thinking (like Kimi-K2)
+ // if you see any templates that explicitly support this, please ping me
+ // std::string reasoning_content;
+
bool operator==(const common_chat_msg_content_part & other) const {
return type == other.type && text == other.text;
}
std::string tool_name;
std::string tool_call_id;
- template <class T> T to_json_oaicompat() const;
+ nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;
bool empty() const {
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
// Parses a JSON array of messages in OpenAI's chat completion API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
-template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
+std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
+nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
+
-// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
-template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
-template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
+nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+
+nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+
+// get template caps, useful for reporting to server /props endpoint
+std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
ops.c_str());
}
+std::map<std::string, bool> caps::to_map() const {
+ return {
+ {"requires_typed_content", requires_typed_content},
+ {"supports_tools", supports_tools},
+ {"supports_tool_calls", supports_tool_calls},
+ {"supports_parallel_tool_calls", supports_parallel_tool_calls},
+ {"supports_system_role", supports_system_role},
+ {"supports_preserve_reasoning", supports_preserve_reasoning},
+ };
+}
+
std::string caps::to_string() const {
std::ostringstream ss;
ss << "Caps(\n";
- ss << " requires_typed_content=" << requires_typed_content << "\n";
- ss << " supports_tools=" << supports_tools << "\n";
- ss << " supports_tool_calls=" << supports_tool_calls << "\n";
- ss << " supports_parallel_tool_calls=" << supports_parallel_tool_calls << "\n";
- ss << " supports_system_role=" << supports_system_role << "\n";
+ for (const auto & [key, value] : to_map()) {
+ ss << " " << key << "=" << (value ? "true" : "false") << "\n";
+ }
ss << ")";
return ss.str();
}
}
);
+ // case: preserve reasoning content in chat history
+ caps_try_execute(
+ prog,
+ [&]() {
+ // messages
+ return json::array({
+ {
+ {"role", "user"},
+ {"content", "User message"}
+ },
+ {
+ {"role", "assistant"},
+ {"content", "Assistant message"},
+ {"reasoning_content", "Reasoning content"}
+ },
+ {
+ {"role", "user"},
+ {"content", "User message"}
+ },
+ });
+ },
+ [&]() {
+ // tools
+ return json::array();
+ },
+ [&](bool, value & messages, value &) {
+ auto & content = messages->at(1)->at("reasoning_content");
+ caps_print_stats(content, "messages[1].reasoning_content");
+ if (content->stats.used) {
+ result.supports_preserve_reasoning = true;
+ }
+ }
+ );
+
JJ_DEBUG("%s\n", result.to_string().c_str());
return result;
#include "runtime.h"
#include <string>
+#include <map>
namespace jinja {
bool supports_tool_calls = true;
bool supports_system_role = true;
bool supports_parallel_tool_calls = true;
+ bool supports_preserve_reasoning = false; // supports assistant messages with reasoning_content
bool requires_typed_content = false; // default: use string content
+ // for reporting on server
+ std::map<std::string, bool> to_map() const;
+
// for debugging
std::string to_string() const;
};
caps caps_get(jinja::program & prog);
-void debug_print_caps(const caps & c);
} // namespace jinja
for (size_t i = 1; i <= raw_message.size(); ++i) {
auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
if (curr_msg == simple_assist_msg("")) continue;
- LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat<json>({curr_msg}).dump().c_str());
+ LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat({curr_msg}).dump().c_str());
for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
- LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat<json>(diff).dump().c_str());
+ LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat(diff).dump().c_str());
if (!diff.reasoning_content_delta.empty()) {
merged.reasoning_content += diff.reasoning_content_delta;
}
merged.tool_calls.back().arguments += diff.tool_call_delta.arguments;
}
}
- LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat<json>({merged}).dump().c_str());
+ LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat({merged}).dump().c_str());
}
assert_msg_equals(curr_msg, merged, true);
last_msg = curr_msg;
message_assist_call_code_interpreter,
};
for (const auto & msg : msgs) {
- auto oai_json = common_chat_msgs_to_json_oaicompat<json>({msg});
+ auto oai_json = common_chat_msgs_to_json_oaicompat({msg});
auto msgs2 = common_chat_msgs_parse_oaicompat(oai_json);
assert_equals((size_t) 1, msgs2.size());
auto msg2 = msgs2[0];
" }\n"
"]"
),
- common_chat_msgs_to_json_oaicompat<json>({message_user_parts}).dump(2));
+ common_chat_msgs_to_json_oaicompat({message_user_parts}).dump(2));
assert_equals(
std::string(
" }\n"
"]"
),
- common_chat_msgs_to_json_oaicompat<json>({message_assist_call_python}).dump(2));
+ common_chat_msgs_to_json_oaicompat({message_assist_call_python}).dump(2));
auto res = common_chat_msgs_parse_oaicompat(json::parse("[{\"role\": \"assistant\", \"tool_calls\": []}]"));
assert_equals<size_t>(1, res.size());
};
for (const auto & tool : tools) {
- auto oai_json = common_chat_tools_to_json_oaicompat<json>({tool});
+ auto oai_json = common_chat_tools_to_json_oaicompat({tool});
auto tools2 = common_chat_tools_parse_oaicompat(oai_json);
assert_equals((size_t) 1, tools2.size());
auto tool2 = tools2[0];
" }\n"
"]"
),
- common_chat_tools_to_json_oaicompat<json>({special_function_tool}).dump(2));
+ common_chat_tools_to_json_oaicompat({special_function_tool}).dump(2));
{
auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse(
"total_slots": 1,
"model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
"chat_template": "...",
+ "chat_template_caps": {},
"modalities": {
"vision": false
},
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
- `model_path` - the path to the model file (same as the `-m` argument)
- `chat_template` - the model's original Jinja2 prompt template
+- `chat_template_caps` - capabilities of the chat template (see `common/jinja/caps.h` for more info and the example below)
- `modalities` - the list of supported modalities
- `is_sleeping` - sleeping status, see [Sleeping on idle](#sleeping-on-idle)
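+
+For example, the `chat_template_caps` object may look like the following; the exact flag values depend on the template, and the key set mirrors `caps::to_map()` in `common/jinja/caps.h`:
+
+```json
+"chat_template_caps": {
+    "requires_typed_content": false,
+    "supports_tools": true,
+    "supports_tool_calls": true,
+    "supports_parallel_tool_calls": true,
+    "supports_system_role": true,
+    "supports_preserve_reasoning": false
+}
+```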
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
+*Reasoning support*
+
+The server supports parsing and returning reasoning via the `reasoning_content` field, similar to the DeepSeek API.
+
+Reasoning input (preserving reasoning content in the chat history) is also supported by some templates. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994).
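+
+For example, a previous assistant turn can be sent back with its reasoning attached (an illustrative message; whether the template actually re-renders `reasoning_content` is reported by the `supports_preserve_reasoning` flag in `/props`):
+
+```json
+{
+    "role": "assistant",
+    "content": "Assistant message",
+    "reasoning_content": "Reasoning content"
+}
+```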
+
### POST `/v1/responses`: OpenAI-compatible Responses API
*Options:*
/* pooling_type */ llama_pooling_type(impl->ctx),
/* chat_params */ impl->chat_params,
+ /* chat_template_caps */ common_chat_templates_get_caps(impl->chat_params.tmpls.get()),
/* bos_token_str */ bos_token_str,
/* eos_token_str */ eos_token_str,
{ "webui", params.webui },
{ "webui_settings", meta->json_webui_settings },
{ "chat_template", tmpl_default },
+ { "chat_template_caps", meta->chat_template_caps },
{ "bos_token", meta->bos_token_str },
{ "eos_token", meta->eos_token_str },
{ "build_info", meta->build_info },
// chat params
server_chat_params & chat_params;
+ std::map<std::string, bool> chat_template_caps;
// tokens
std::string bos_token_str;
json choice {
{"finish_reason", finish_reason},
{"index", index},
- {"message", msg.to_json_oaicompat<json>()},
+ {"message", msg.to_json_oaicompat()},
};
if (!stream && probs_output.size() > 0) {
json {
{"finish_reason", nullptr},
{"index", 0},
- {"delta", common_chat_msg_diff_to_json_oaicompat<json>(diff)},
+ {"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
},
})},
{"created", t},
}
for (const auto & diff : oaicompat_msg_diffs) {
- add_delta(common_chat_msg_diff_to_json_oaicompat<json>(diff));
+ add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
}
if (!deltas.empty()) {