).set_sparam());
add_opt(common_arg(
{"--grammar"}, "GRAMMAR",
- string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
+ "BNF-like grammar to constrain generations (see samples in grammars/ dir)",
[](common_params & params, const std::string & value) {
- params.sampling.grammar = value;
+ params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, value};
}
).set_sparam());
add_opt(common_arg(
{"--grammar-file"}, "FNAME",
"file to read grammar from",
[](common_params & params, const std::string & value) {
- params.sampling.grammar = read_file(value);
+ params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, read_file(value)};
}
).set_sparam());
add_opt(common_arg(
{"-j", "--json-schema"}, "SCHEMA",
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
[](common_params & params, const std::string & value) {
- params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+ params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, json_schema_to_grammar(json::parse(value))};
}
).set_sparam());
add_opt(common_arg(
std::istreambuf_iterator<char>(),
std::back_inserter(schema)
);
- params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
+ params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, json_schema_to_grammar(json::parse(schema))};
}
).set_sparam());
add_opt(common_arg(
+#include "chat-auto-parser-helpers.h"
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
#include "chat.h"
namespace autoparser {
-parser_build_context::parser_build_context(common_chat_peg_builder & p, const templates_params & inputs) :
+parser_build_context::parser_build_context(common_chat_peg_builder & p, const generation_params & inputs) :
p(p),
inputs(inputs),
reasoning_parser(p.eps()) {}
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
- const struct templates_params & inputs) {
+ const struct generation_params & inputs) {
// Run differential analysis to extract template structure
struct autoparser autoparser;
autoparser.analyze_template(tmpl);
}
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
- const struct templates_params & inputs,
+ const struct generation_params & inputs,
const autoparser & autoparser) {
- // Build the parser using the analysis results
- auto parser = autoparser.build_parser(inputs);
-
// Create the result structure
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
- data.parser = parser.save();
+
+ auto parser = autoparser.build_parser(inputs);
+ data.parser = parser.save();
// Build grammar if tools are present
bool has_tools =
return data;
}
-common_peg_arena autoparser::build_parser(const templates_params & inputs) const {
+common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
if (!analysis_complete) {
throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
}
return build_chat_peg_parser([&](common_chat_peg_builder & p) {
- // If the template uses Python dict format (single-quoted strings in JSON structures),
- // pre-register a json-string rule that accepts both quote styles. This must happen
- // before any call to p.json() so that all JSON parsing inherits the flexible rule.
- if (tools.format.uses_python_dicts) {
- p.rule("json-string", p.quoted_string());
- }
-
parser_build_context ctx(p, inputs);
bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
- bool enable_thinking = inputs.enable_thinking;
- ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
+ ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
ctx.content = &content;
// Build reasoning parser
ctx.reasoning_parser = reasoning.build_parser(ctx);
+ auto parser = p.eps();
+
bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();
bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
if (has_response_format) {
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
- return ctx.reasoning_parser + p.space() + p.choice({
+ parser = ctx.reasoning_parser + p.space() + p.choice({
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
response_format
}) + p.end();
+ } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
+ parser = tools.build_parser(ctx);
+ } else {
+ parser = content.build_parser(ctx);
}
-
- if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
- return tools.build_parser(ctx);
- }
-
- return content.build_parser(ctx);
+ parser = wrap_for_generation_prompt(p, parser, inputs, reasoning.start);
+ return parser;
});
}
return p.eps();
}
- bool thinking_forced_open = (mode == reasoning_mode::FORCED_OPEN);
- bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
-
- if (thinking_forced_open || thinking_forced_closed) {
- // Thinking is forced open OR forced closed with enable_thinking=true
- // In both cases, expect only the closing tag (opening was in template)
- // However, since we might have incorrectly detected the open/close pattern,
- // we admit an optional starting marker
- return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
- }
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
- // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
- // Both use the same tag-based pattern if markers are available
- if (!start.empty() && !end.empty()) {
- return p.optional(start + p.reasoning(p.until(end)) + end);
+ if (!end.empty()) {
+ if (!start.empty()) {
+ // Standard tag-based: optional(<think>reasoning</think>)
+ return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
+ }
+ // Delimiter-style (empty start)
+ return p.optional(p.reasoning(p.until(end)) + end + p.space());
}
- } else if (mode == reasoning_mode::DELIMITER) {
- return p.optional(p.reasoning(p.until(end)) + end);
}
return p.eps();
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_json_value(p.schema(
- p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, format.uses_python_dicts)) +
+ p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
p.space()) +
p.tool_arg_close(p.literal(arguments.value_suffix)));
#include "chat-auto-parser-helpers.h"
#include "chat-auto-parser.h"
+#include "chat-peg-parser.h"
#include "chat.h"
#include "log.h"
#include "nlohmann/json.hpp"
+#include "peg-parser.h"
#include <cctype>
#include <numeric>
return result;
}
+common_peg_parser wrap_for_generation_prompt(common_chat_peg_builder & p,
+ const common_peg_parser & prs,
+ const autoparser::generation_params & inputs,
+ const std::string & reasoning_start) {
+ auto parser = prs;
+ if (!inputs.generation_prompt.empty()) {
+ size_t end_pos = inputs.generation_prompt.size();
+ if (!reasoning_start.empty()) {
+ const size_t pos = inputs.generation_prompt.find(reasoning_start);
+ if (pos != std::string::npos) {
+ end_pos = pos;
+ }
+ }
+ std::string cut_genprompt = inputs.generation_prompt.substr(0, end_pos);
+ parser = p.literal(cut_genprompt) + parser;
+ }
+ return parser;
+}
+
namespace autoparser {
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
- templates_params tmpl_params;
+ generation_params tmpl_params;
tmpl_params.messages = params.messages;
tmpl_params.tools = params.tools;
tmpl_params.add_generation_prompt = params.add_generation_prompt;
#pragma once
#include "chat-auto-parser.h"
+#include "peg-parser.h"
#include <functional>
#include <optional>
#include <string>
// (MARKER, "</function>"), (MARKER, "</tool_call>") ]
std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments);
+// Wrap parser with generation prompt parser
+common_peg_parser wrap_for_generation_prompt(common_chat_peg_builder & p,
+ const common_peg_parser & prs,
+ const autoparser::generation_params & inputs,
+ const std::string & reasoning_start = {});
namespace autoparser {
// Apply a template with the given parameters, returning the rendered string (empty on failure)
// High-level params for parser generation
// ============================================================================
-struct templates_params {
+struct generation_params {
json messages;
json tools;
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool add_generation_prompt = false;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+ std::string generation_prompt;
json extra_context;
bool add_bos = false;
bool add_eos = false;
// Reasoning handling mode (derived from R1-R3 comparisons)
enum class reasoning_mode {
NONE, // No reasoning markers detected
- TAG_BASED, // Standard tag-based: <think>...</think>
- DELIMITER, // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
- FORCED_OPEN, // Template ends with open reasoning tag (empty start, non-empty end)
- FORCED_CLOSED, // Template ends with open reasoning tag on enabled thinking but
- // with both opened and closed tag for disabled thinking
+ TAG_BASED, // Tag-based: <think>...</think> (start can be empty for delimiter-style)
TOOLS_ONLY // Only reason on tool calls, not on normal content
};
return os << "NONE";
case reasoning_mode::TAG_BASED:
return os << "TAG_BASED";
- case reasoning_mode::DELIMITER:
- return os << "DELIMITER";
- case reasoning_mode::FORCED_OPEN:
- return os << "FORCED_OPEN";
- case reasoning_mode::FORCED_CLOSED:
- return os << "FORCED_CLOSED";
case reasoning_mode::TOOLS_ONLY:
return os << "TOOLS_ONLY";
default:
bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...]
- bool uses_python_dicts = false; // Tool call args use Python dict format (single-quoted strings)
std::string function_field = "function";
std::string name_field = "name";
struct parser_build_context {
common_chat_peg_builder & p;
- const templates_params & inputs;
+ const generation_params & inputs;
common_peg_parser reasoning_parser;
bool extracting_reasoning = false;
const analyze_content * content = nullptr;
- parser_build_context(common_chat_peg_builder & p, const templates_params & inputs);
+ parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);
};
// ============================================================================
analyze_reasoning() = default;
analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
+ analyze_reasoning(std::string start_, std::string end_) : start(std::move(start_)), end(std::move(end_)) {}
common_peg_parser build_parser(parser_build_context & ctx) const override;
void analyze_template(const common_chat_template & tmpl);
// Build the PEG parser for this template
- common_peg_arena build_parser(const templates_params & inputs) const;
+ common_peg_arena build_parser(const generation_params & inputs) const;
private:
// Collect tokens from entire analysis to preserve
class peg_generator {
public:
static common_chat_params generate_parser(const common_chat_template & tmpl,
- const struct templates_params & inputs);
+ const struct generation_params & inputs);
static common_chat_params generate_parser(const common_chat_template & tmpl,
- const struct templates_params & inputs,
+ const struct generation_params & inputs,
const autoparser & autoparser);
};
#include "chat-auto-parser-helpers.h"
#include "chat-peg-parser.h"
#include "chat.h"
+#include "common.h"
#include "log.h"
#include "nlohmann/json.hpp"
#include "peg-parser.h"
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find("content.split('</think>')") != std::string::npos &&
tmpl.src.find("reasoning_content") == std::string::npos &&
+ tmpl.src.find("<SPECIAL_12>") == std::string::npos &&
analysis.reasoning.mode == reasoning_mode::NONE) {
- analysis.reasoning.mode = reasoning_mode::FORCED_OPEN;
+ analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.start = "<think>";
analysis.reasoning.end = "</think>";
analysis.preserved_tokens.push_back("<think>");
LOG_DBG("func_name_prefix: '%s'\n", tools.function.name_prefix.c_str());
LOG_DBG("func_name_suffix: '%s'\n", tools.function.name_suffix.c_str());
LOG_DBG("func_close: '%s'\n", tools.function.close.c_str());
- LOG_DBG("python_dict_format: %s\n", tools.format.uses_python_dicts ? "true" : "false");
LOG_DBG("arg_name_prefix: '%s'\n", tools.arguments.name_prefix.c_str());
LOG_DBG("arg_name_suffix: '%s'\n", tools.arguments.name_suffix.c_str());
LOG_DBG("arg_value_prefix: '%s'\n", tools.arguments.value_prefix.c_str());
}
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
- if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
- mode = reasoning_mode::TAG_BASED;
- } else {
- mode = reasoning_mode::FORCED_CLOSED;
- }
+ mode = reasoning_mode::TAG_BASED;
start = trim_whitespace(result.tags["pre"]);
- end = result.tags["post"];
+ end = trim_trailing_whitespace(result.tags["post"]);
} else if (!result.tags["post"].empty()) {
- mode = reasoning_mode::DELIMITER;
- end = result.tags["post"];
+ mode = reasoning_mode::TAG_BASED;
+ end = trim_trailing_whitespace(result.tags["post"]);
}
}
}
const auto & diff = comparison->diff;
std::string left_trimmed = trim_whitespace(diff.left);
+ std::string right_trimmed = trim_whitespace(diff.right);
if (left_trimmed.empty() && !diff.right.empty()) {
- std::string right_trimmed = trim_whitespace(diff.right);
-
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
if (start.empty()) {
start = right_trimmed;
- mode = reasoning_mode::FORCED_OPEN;
+ mode = reasoning_mode::TAG_BASED;
}
}
- }
-
- if (start.empty() && !end.empty()) {
- mode = reasoning_mode::DELIMITER;
- }
-
- // Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
- // but enable_thinking=true produces only the start marker
- if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
- auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
- return p.literal(start) + p.space() + p.literal(end) + p.rest();
- });
- auto parser_start_end = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
- return p.tag("pre", p.literal(start)) + p.space() + p.negate(p.literal(end)) + p.rest();
- });
- if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
- parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
- mode = reasoning_mode::FORCED_CLOSED;
- } else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
- auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
- if (result.result.success()) {
- start = result.tags["pre"];
- mode = reasoning_mode::FORCED_CLOSED;
+ } else if (right_trimmed.empty() && !diff.left.empty()) {
+ if (!left_trimmed.empty() && string_ends_with(comparison->output_A, left_trimmed)) {
+ if (end.empty()) {
+ auto seg = prune_whitespace_segments(segmentize_markers(comparison->output_A));
+ if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
+ start = seg[seg.size() - 2].value;
+ }
+ end = left_trimmed;
+ mode = reasoning_mode::TAG_BASED;
}
}
}
- if (start.empty() && end.empty()) { // we might still have the case of "just open" and "just close"
- if (!diff.left.empty() && !diff.right.empty()) {
- auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
- auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
- if (seg_A.size() == 1 && seg_B.size() == 1) {
- mode = reasoning_mode::FORCED_CLOSED;
- start = seg_B[0].value;
- end = seg_A[0].value;
- }
- }
+ if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
+ mode = reasoning_mode::TAG_BASED;
}
}
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) {
start = result.tags["pre"];
- end = result.tags["post"];
+ end = trim_trailing_whitespace(result.tags["post"]);
} else {
auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
});
result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) {
- end = result.tags["post"];
+ end = trim_trailing_whitespace(result.tags["post"]);
} else {
LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
mode = reasoning_mode::NONE;
return;
}
- enum class json_quote_style { NONE, DOUBLE_QUOTES, SINGLE_QUOTES };
-
- auto in_json_haystack = [&haystack](const std::string & needle) -> json_quote_style {
+ auto in_json_haystack = [&haystack](const std::string & needle) -> bool {
auto parser = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.choice({ p.literal("{"), p.literal(":") }) << p.choice({
- p.tag("sq", p.literal("'") + p.literal(needle) + p.literal("'")),
p.tag("dq", p.literal("\"") + p.literal(needle) + p.literal("\"")) });
});
auto result = parser.parse_anywhere_and_extract(haystack);
- if (!result.result.success()) {
- return json_quote_style::NONE;
- }
- return result.tags.count("sq") && !result.tags["sq"].empty()
- ? json_quote_style::SINGLE_QUOTES
- : json_quote_style::DOUBLE_QUOTES;
+ return result.result.success();
};
auto fun_quote = in_json_haystack(fun_name_needle);
auto arg_quote = in_json_haystack(arg_name_needle);
- if (fun_quote != json_quote_style::NONE) {
+ if (fun_quote) {
// no need to check further, we're in JSON land
format.mode = tool_format::JSON_NATIVE;
- format.uses_python_dicts = (fun_quote == json_quote_style::SINGLE_QUOTES);
- } else if (arg_quote != json_quote_style::NONE) {
+ } else if (arg_quote) {
format.mode = tool_format::TAG_WITH_JSON;
- format.uses_python_dicts = (arg_quote == json_quote_style::SINGLE_QUOTES);
} else {
format.mode = tool_format::TAG_WITH_TAGGED;
}
result.tool_calls.push_back(pending_tool_call.value());
pending_tool_call.reset();
}
+
+ // Discard whitespace-only reasoning content (e.g. from <think></think> prefill)
+ if (!result.reasoning_content.empty()) {
+ bool all_whitespace = true;
+ for (char c : result.reasoning_content) {
+ if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
+ all_whitespace = false;
+ break;
+ }
+ }
+ if (all_whitespace) {
+ result.reasoning_content.clear();
+ }
+ }
}
void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
#include "chat.h"
+#include "chat-auto-parser-helpers.h"
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
#include "common.h"
#include <sstream>
#include <stdexcept>
#include <string>
+#include <utility>
#include <vector>
using json = nlohmann::ordered_json;
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
- const autoparser::templates_params & inputs,
+ const autoparser::generation_params & inputs,
const std::optional<json> & messages_override,
const std::optional<json> & tools_override,
const std::optional<json> & additional_context) {
}
static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl,
- const autoparser::templates_params & inputs) {
+ const autoparser::generation_params & inputs) {
common_chat_params data;
// Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
// Response format parser
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
// Ministral wants to emit json surrounded by code fences
- return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema))
- << "```";
+ return wrap_for_generation_prompt(p, reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```",
+ inputs, "[THINK]");
}
// Tool call parser
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
- return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+ return wrap_for_generation_prompt(p, reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls,
+ inputs, "[THINK]");
}
// Content only parser
include_grammar = false;
- return reasoning << p.content(p.rest());
+ return wrap_for_generation_prompt(p, reasoning << p.content(p.rest()), inputs, "[THINK]");
});
data.parser = parser.save();
}
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl,
- const autoparser::templates_params & inputs) {
+ const autoparser::generation_params & inputs) {
common_chat_params data;
// Copy reasoning to the "thinking" field as expected by the gpt-oss template
p.literal("<|channel|>final") + constraint + p.literal("<|message|>") +
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
- return response_format | (analysis + p.zero_or_more(start + analysis) + start + response_format);
+ return wrap_for_generation_prompt(p, response_format | (analysis + p.zero_or_more(start + analysis) + start + response_format),
+ inputs, "<|channel|>");
}
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
return tool_call | ( any + p.zero_or_more(start + any) + start + tool_call);
}
- return tool_call | final_msg | (any + p.zero_or_more(start + any) + start + (tool_call | final_msg));
+ return wrap_for_generation_prompt(p, tool_call | final_msg | (any + p.zero_or_more(start + any) + start + (tool_call | final_msg)),
+ inputs, "<|channel|>");
}
- return final_msg | (any + p.zero_or_more(start + any) + start + final_msg);
+ return wrap_for_generation_prompt(p, final_msg | (any + p.zero_or_more(start + any) + start + final_msg),
+ inputs, "<|channel|>");
});
data.parser = parser.save();
// Functionary v3.2 - uses recipient-based format: >>>recipient\n{content}
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl,
- const autoparser::templates_params & inputs) {
+ const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
// Build content parser for >>>all\n{content}
// When tools are present, content stops before the next ">>>" (tool call)
// When no tools, content goes until end
- auto content_until_tool = p.literal(">>>all\n") + p.content(p.until(">>>"));
- auto content_until_end = p.literal(">>>all\n") + p.content(p.rest());
+ auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>"));
+ auto content_until_end = p.literal("all\n") + p.content(p.rest());
// If no tools or tool_choice is NONE, just parse content
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
// When no tools, just match the prefix and capture everything after
- return content_until_end + p.end();
+ return wrap_for_generation_prompt(p, content_until_end + p.end(), inputs);
}
// Build tool call parsers for each available function
// Tool format: >>>function_name\n{json_args}
auto tool_parser = p.tool(
- p.tool_open(p.literal(">>>") + p.tool_name(p.literal(name)) + p.literal("\n")) +
+ p.tool_open(p.tool_name(p.literal(name)) + p.literal("\n")) +
p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
);
auto tools_only = p.trigger_rule("tools", p.one_or_more(tool_choice));
auto content_and_tools = content_until_tool + tools_only;
+ auto ret = p.eps();
if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
if (inputs.parallel_tool_calls) {
- return p.choice({ content_and_tools, tools_only }) + p.end();
+ ret = p.choice({ content_and_tools, tools_only }) + p.end();
+ } else {
+ ret = p.choice({ content_until_tool + tool_choice, tools_only }) + p.end();
}
- return p.choice({ content_until_tool + tool_choice, tools_only }) + p.end();
- }
- if (inputs.parallel_tool_calls) {
- return p.choice({ content_and_tools, content_only, tools_only }) + p.end();
+ } else if (inputs.parallel_tool_calls) {
+ ret = p.choice({ content_and_tools, content_only, tools_only }) + p.end();
+ } else {
+ auto content_and_tool = content_until_tool + tool_choice;
+ ret = p.choice({ content_and_tool, content_only, tool_choice }) + p.end();
}
- auto content_and_tool = content_until_tool + tool_choice;
- return p.choice({ content_and_tool, content_only, tool_choice }) + p.end();
+ return wrap_for_generation_prompt(p, ret, inputs);
});
data.parser = parser.save();
// Kimi K2 Thinking - uses unique tool call ID format: functions.<name>:<index>
// The ID contains both the function name and an incrementing counter
static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl,
- const autoparser::templates_params & inputs) {
+ const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
- data.thinking_start_tag = "<think>";
- data.thinking_end_tag = "</think>";
data.preserved_tokens = {
"<|tool_calls_section_begin|>",
"<|tool_calls_section_end|>",
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
+ const std::string SECTION_BEGIN = "<|tool_calls_section_begin|>";
+ const std::string SECTION_END = "<|tool_calls_section_end|>";
+ const std::string CALL_BEGIN = "<|tool_call_begin|>";
+ const std::string ARGS_BEGIN = "<|tool_call_argument_begin|>";
+ const std::string CALL_END = "<|tool_call_end|>";
+
+ const std::string THINK_START = "<think>";
+ const std::string THINK_END = "</think>";
+
+ data.thinking_start_tag = THINK_START;
+ data.thinking_end_tag = THINK_END;
+
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
// Kimi K2 Thinking format:
// - Reasoning: <think>{reasoning}</think>
// <|tool_calls_section_end|>
// The ID format is: functions.<function_name>:<counter> where counter is 0, 1, 2, ...
- // Tool call markers
- const std::string SECTION_BEGIN = "<|tool_calls_section_begin|>";
- const std::string SECTION_END = "<|tool_calls_section_end|>";
- const std::string CALL_BEGIN = "<|tool_call_begin|>";
- const std::string ARGS_BEGIN = "<|tool_call_argument_begin|>";
- const std::string CALL_END = "<|tool_call_end|>";
-
- const std::string THINK_START = "<think>";
- const std::string THINK_END = "</think>";
-
auto end = p.end();
// Note: this model is CRAZY. It can diverge from its supposed tool calling pattern in so many ways it's not funny.
// Content only parser (no tools)
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
- return reasoning + p.content(p.rest()) + end;
+ return wrap_for_generation_prompt(p, reasoning + p.content(p.rest()) + end,
+ inputs, THINK_START);
}
// Build tool call parsers for each available function
auto content_before_tools = p.content(p.until_one_of({ SECTION_BEGIN, CALL_BEGIN }));
- return reasoning + content_before_tools + tool_calls + end;
+ return wrap_for_generation_prompt(p, reasoning + content_before_tools + tool_calls + end,
+ inputs, THINK_START);
});
data.parser = parser.save();
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
- const autoparser::templates_params & inputs) {
+ const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
-
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
- auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+ data.thinking_start_tag = THINK_START;
+ data.thinking_end_tag = THINK_END;
+
+ auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto end = p.end();
auto reasoning = p.eps();
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
- return reasoning + p.content(p.rest()) + end;
+ return wrap_for_generation_prompt(p, reasoning + p.content(p.rest()) + end, inputs,
+ THINK_START);
}
auto tool_calls = p.rule("tool-calls",
auto content = p.content(p.until(TOOL_CALL_START));
- return reasoning + content + tool_calls + end;
+ return wrap_for_generation_prompt(p, reasoning + content + tool_calls + end, inputs,
+ THINK_START);
});
data.parser = parser.save();
static common_chat_params common_chat_params_init_gigachat_v3(
const common_chat_template & tmpl,
- const autoparser::templates_params & inputs) {
+ const autoparser::generation_params & inputs) {
common_chat_params data;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
- auto tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
+ const auto *tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+ auto ret = p.eps();
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
// Build a choice of all available tools
auto tool_choice = p.choice();
auto tool_call = p.rule("tool-call", p.literal(tool_call_start_prefix) + tool_choice);
auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
- return p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
+ ret = p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
+ } else {
+ // Content only parser
+ include_grammar = false;
+ ret = p.content(p.rest());
}
- // Content only parser
- include_grammar = false;
- return p.content(p.rest());
-
+ return wrap_for_generation_prompt(p, ret, inputs);
});
data.parser = parser.save();
return ctx;
}
+static std::optional<common_chat_params> try_specialized_template(
+ const common_chat_template & tmpl,
+ const std::string & src,
+ const autoparser::generation_params & params) {
+ // Ministral/Mistral Large 3 - uses special reasoning structure fixes, can't use autoparser
+ // Note: Mistral Small 3.2 uses [CALL_ID] which Ministral doesn't have, so we can distinguish them
+ if (src.find("[SYSTEM_PROMPT]") != std::string::npos && src.find("[TOOL_CALLS]") != std::string::npos &&
+ src.find("[ARGS]") != std::string::npos && src.find("[CALL_ID]") == std::string::npos) {
+ LOG_DBG("Using specialized template: Ministral/Magistral Large 3\n");
+ return common_chat_params_init_ministral_3(tmpl, params);
+ }
+
+ // GPT-OSS - has unique channel-based structure that needs dedicated handler
+ if (src.find("<|channel|>") != std::string::npos) {
+ LOG_DBG("Using specialized template: GPT-OSS\n");
+ return common_chat_params_init_gpt_oss(tmpl, params);
+ }
+
+ // Functionary v3.2 - uses recipient-based format with >>>recipient\n{content}
+ // Detection: template has ">>>all" for content and ">>>" prefix for tool calls
+ if (src.find(">>>all") != std::string::npos && src.find(">>>${recipient}") != std::string::npos) {
+ LOG_DBG("Using specialized template: Functionary v3.2\n");
+ return common_chat_params_init_functionary_v3_2(tmpl, params);
+ }
+
+ // Kimi K2 Thinking - uses unique tool call ID format: functions.<name>:<index>
+ // Detection: template has "<|tool_calls_section_begin|>" and "functions." prefix in tool call IDs
+ if (src.find("<|tool_calls_section_begin|>") != std::string::npos &&
+ src.find("<|tool_call_begin|>") != std::string::npos) {
+ LOG_DBG("Using specialized template: Kimi K2 Thinking\n");
+ return common_chat_params_init_kimi_k2(tmpl, params);
+ }
+
+ // LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
+ // Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
+ if (src.find("<|tool_list_start|>") != std::string::npos &&
+ src.find("<|tool_list_end|>") != std::string::npos) {
+ LOG_DBG("Using specialized template: LFM2\n");
+ return common_chat_params_init_lfm2(tmpl, params);
+ }
+
+ // GigaChatV3 format detection
+ if (src.find("<|role_sep|>") != std::string::npos &&
+ src.find("<|message_sep|>") != std::string::npos &&
+ src.find("<|function_call|>") == std::string::npos) {
+ LOG_DBG("Using specialized template: GigaChatV3\n");
+ return common_chat_params_init_gigachat_v3(tmpl, params);
+ }
+
+ return std::nullopt;
+}
+
static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs) {
- autoparser::templates_params params;
+ autoparser::generation_params params;
params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
- const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
- ? *tmpls->template_tool_use
- : *tmpls->template_default;
- const auto & src = tmpl.source();
- const auto & caps = tmpl.original_caps();
- params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
- params.add_generation_prompt = inputs.add_generation_prompt;
- params.tool_choice = inputs.tool_choice;
+ const auto & tmpl =
+ params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
+ const auto & src = tmpl.source();
+ const auto & caps = tmpl.original_caps();
+ params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
+ params.tool_choice = inputs.tool_choice;
params.reasoning_format = inputs.reasoning_format;
- params.enable_thinking = inputs.enable_thinking;
- params.grammar = inputs.grammar;
- params.now = inputs.now;
- params.add_bos = tmpls->add_bos;
- params.add_eos = tmpls->add_eos;
+ params.enable_thinking = inputs.enable_thinking;
+ params.grammar = inputs.grammar;
+ params.now = inputs.now;
+ params.add_bos = tmpls->add_bos;
+ params.add_eos = tmpls->add_eos;
if (src.find("<|channel|>") == std::string::npos) {
// map developer to system for all models except for GPT-OSS
workaround::func_args_not_string(params.messages);
}
+ params.add_generation_prompt = false;
+ std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
+ params.add_generation_prompt = true;
+ std::string gen_prompt = common_chat_template_direct_apply(tmpl, params);
+ auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
+ params.generation_prompt = diff.right;
+
+ params.add_generation_prompt = inputs.add_generation_prompt;
+
params.extra_context = common_chat_extra_context();
for (auto el : inputs.chat_template_kwargs) {
params.extra_context[el.first] = json::parse(el.second);
params.json_schema = json::parse(inputs.json_schema);
}
- // if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
- // LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
- // params.parallel_tool_calls = false;
- // } else {
params.parallel_tool_calls = inputs.parallel_tool_calls;
- //}
if (params.tools.is_array()) {
if (params.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && !params.grammar.empty()) {
params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE;
data.prompt = common_chat_template_direct_apply(tmpl, params_copy);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
- auto parser = build_chat_peg_parser([](common_chat_peg_builder &p) {
- return p.content(p.rest());
+ data.generation_prompt = params.generation_prompt;
+ auto parser = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
+ return wrap_for_generation_prompt(p, p.content(p.rest()), params);
});
data.parser = parser.save();
return data;
}
- // Ministral/Mistral Large 3 - uses special reasoning structure fixes, can't use autoparser
- // Note: Mistral Small 3.2 uses [CALL_ID] which Ministral doesn't have, so we can distinguish them
- if (src.find("[SYSTEM_PROMPT]") != std::string::npos && src.find("[TOOL_CALLS]") != std::string::npos &&
- src.find("[ARGS]") != std::string::npos && src.find("[CALL_ID]") == std::string::npos) {
- LOG_DBG("Using specialized template: Ministral/Magistral Large 3\n");
- return common_chat_params_init_ministral_3(tmpl, params);
- }
-
- // GPT-OSS - has unique channel-based structure that needs dedicated handler
- if (src.find("<|channel|>") != std::string::npos) {
- LOG_DBG("Using specialized template: GPT-OSS\n");
- return common_chat_params_init_gpt_oss(tmpl, params);
- }
-
- // Functionary v3.2 - uses recipient-based format with >>>recipient\n{content}
- // Detection: template has ">>>all" for content and ">>>" prefix for tool calls
- if (src.find(">>>all") != std::string::npos && src.find(">>>${recipient}") != std::string::npos) {
- LOG_DBG("Using specialized template: Functionary v3.2\n");
- return common_chat_params_init_functionary_v3_2(tmpl, params);
- }
-
- // Kimi K2 Thinking - uses unique tool call ID format: functions.<name>:<index>
- // Detection: template has "<|tool_calls_section_begin|>" and "functions." prefix in tool call IDs
- if (src.find("<|tool_calls_section_begin|>") != std::string::npos &&
- src.find("<|tool_call_begin|>") != std::string::npos) {
- LOG_DBG("Using specialized template: Kimi K2 Thinking\n");
- return common_chat_params_init_kimi_k2(tmpl, params);
- }
-
- // LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
- // Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
- if (src.find("<|tool_list_start|>") != std::string::npos &&
- src.find("<|tool_list_end|>") != std::string::npos) {
- LOG_DBG("Using specialized template: LFM2\n");
- return common_chat_params_init_lfm2(tmpl, params);
- }
-
- // GigaChatV3 format detection
- if (src.find("<|role_sep|>") != std::string::npos &&
- src.find("<|message_sep|>") != std::string::npos &&
- src.find("<|function_call|>") == std::string::npos
- ) {
- LOG_DBG("Using specialized template: GigaChatV3\n");
- return common_chat_params_init_gigachat_v3(tmpl, params);
+ if (auto result = try_specialized_template(tmpl, src, params)) {
+ result->generation_prompt = params.generation_prompt;
+ return *result;
}
try {
- LOG_DBG("Using differential autoparser\n");
+ LOG_DBG("%s: using differential autoparser\n", __func__);
struct autoparser::autoparser autoparser;
autoparser.analyze_template(tmpl);
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
if (auto_params.supports_thinking) {
auto_params.thinking_start_tag = autoparser.reasoning.start;
auto_params.thinking_end_tag = autoparser.reasoning.end;
- // FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
- // (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
- // but forces <think> open when thinking is enabled)
- auto_params.thinking_forced_open =
- autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
- autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
}
+ auto_params.generation_prompt = params.generation_prompt;
+ common_peg_arena arena;
+ arena.load(auto_params.parser);
+ LOG_DBG("%s: generated parser:\n%s\n\nparser generation prompt: %s\n", __func__, arena.dump(arena.root()).c_str(), auto_params.generation_prompt.c_str());
return auto_params;
} catch (const std::exception & e) {
throw std::invalid_argument(std::string("Unable to generate parser for this template. Automatic parser generation failed: ") + e.what());
LOG_DBG("No parser definition detected, assuming pure content parser.");
}
- LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
+ const std::string effective_input = params.generation_prompt.empty()
+ ? input
+ : params.generation_prompt + input;
+
+ LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
}
- common_peg_parse_context ctx(input, flags);
+ common_peg_parse_context ctx(effective_input, flags);
auto result = parser.parse(ctx);
if (result.fail()) {
struct common_chat_templates;
namespace autoparser {
-struct templates_params;
+struct generation_params;
} // namespace autoparser
struct common_chat_tool_call {
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
- bool thinking_forced_open = false;
+ std::string generation_prompt;
bool supports_thinking = false;
std::string thinking_start_tag; // e.g., "<think>"
std::string thinking_end_tag; // e.g., "</think>"
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
- bool thinking_forced_open = false;
+ std::string generation_prompt;
bool parse_tool_calls = true;
bool debug = false; // Enable debug output for PEG parser
common_peg_arena parser = {};
common_chat_parser_params() = default;
common_chat_parser_params(const common_chat_params & chat_params) {
- format = chat_params.format;
- thinking_forced_open = chat_params.thinking_forced_open;
+ format = chat_params.format;
+ generation_prompt = chat_params.generation_prompt;
}
};
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
- const autoparser::templates_params & inputs,
+ const autoparser::generation_params & inputs,
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt);
#pragma once
#include "ggml-opt.h"
+#include "ggml.h"
#include "llama-cpp.h"
#include <set>
#include <sstream>
#include <string>
#include <string_view>
+#include <variant>
#include <vector>
#include <map>
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
+// Grammar type enumeration
+enum common_grammar_type {
+ COMMON_GRAMMAR_TYPE_NONE, // no grammar set
+ COMMON_GRAMMAR_TYPE_USER, // user-provided GBNF (--grammar / "grammar" API field)
+ COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, // auto-generated from JSON schema (--json-schema / "json_schema" API field)
+ COMMON_GRAMMAR_TYPE_TOOL_CALLS, // auto-generated by chat template parser for function calling
+};
+
+// Grammar variant struct with type and grammar string
+struct common_grammar {
+ common_grammar_type type = COMMON_GRAMMAR_TYPE_NONE;
+ std::string grammar;
+
+ // Default constructor - no grammar
+ common_grammar() = default;
+
+ // Constructor with type and grammar string
+ common_grammar(common_grammar_type t, std::string g) : type(t), grammar(std::move(g)) {
+ GGML_ASSERT(type != COMMON_GRAMMAR_TYPE_NONE || grammar.empty());
+ }
+
+ // Check if a grammar is set
+ bool empty() const { return type == COMMON_GRAMMAR_TYPE_NONE || grammar.empty(); }
+};
+
+// Returns the raw grammar string, or empty string if no grammar is set.
+inline const std::string & common_grammar_value(const common_grammar & g) {
+ return g.grammar;
+}
+
+// Returns true when the generation_prompt should be prefilled into the grammar sampler.
+// Only output-format and tool-call grammars need prefill; user-supplied grammars must not be prefilled.
+inline bool common_grammar_needs_prefill(const common_grammar & g) {
+ return g.type == COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT
+ || g.type == COMMON_GRAMMAR_TYPE_TOOL_CALLS;
+}
+
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
COMMON_SAMPLER_TYPE_TEMPERATURE,
};
- std::string grammar; // optional BNF-like grammar to constrain sampling
+ common_grammar grammar; // optional grammar constraint (user / output-format / tool-calls)
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
std::set<llama_token> preserved_tokens;
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
+ // The assistant generation prompt already prefilled into the prompt.
+ // Fed to the grammar sampler (to advance past pre-existing tokens) and used
+ // to determine the reasoning budget sampler's initial state.
+ // Only applied when the grammar is of output-format or tool-calls type.
+ std::string generation_prompt;
+
// reasoning budget sampler parameters
// these are populated by the server/CLI based on chat template params
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
- bool reasoning_budget_activate_immediately = false;
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
ctx->force_pos = 0;
}
+// forward declaration for use in clone
+static struct llama_sampler * common_reasoning_budget_init_state(
+ const struct llama_vocab * vocab, const std::vector<llama_token> & start_tokens,
+ const std::vector<llama_token> & end_tokens, const std::vector<llama_token> & forced_tokens,
+ int32_t budget, common_reasoning_budget_state initial_state);
+
static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
- return common_reasoning_budget_init(
+ return common_reasoning_budget_init_state(
ctx->vocab,
ctx->start_matcher.tokens,
ctx->end_matcher.tokens,
/* .backend_set_input = */ nullptr,
};
-struct llama_sampler * common_reasoning_budget_init(
- const struct llama_vocab * vocab,
- const std::vector<llama_token> & start_tokens,
- const std::vector<llama_token> & end_tokens,
- const std::vector<llama_token> & forced_tokens,
- int32_t budget,
- common_reasoning_budget_state initial_state) {
+static struct llama_sampler * common_reasoning_budget_init_state(
+ const struct llama_vocab * vocab,
+ const std::vector<llama_token> & start_tokens,
+ const std::vector<llama_token> & end_tokens,
+ const std::vector<llama_token> & forced_tokens,
+ int32_t budget,
+ common_reasoning_budget_state initial_state) {
// promote COUNTING with budget <= 0 to FORCING
if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
initial_state = REASONING_BUDGET_FORCING;
}
);
}
+
+struct llama_sampler * common_reasoning_budget_init(
+ const struct llama_vocab * vocab,
+ const std::vector<llama_token> & start_tokens,
+ const std::vector<llama_token> & end_tokens,
+ const std::vector<llama_token> & forced_tokens,
+ int32_t budget,
+ const std::vector<llama_token> & prefill_tokens) {
+ // Determine initial state from prefill: COUNTING if the prefill begins with
+ // the start sequence but does not also contain the end sequence after it.
+ common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
+ if (!prefill_tokens.empty() && !start_tokens.empty() &&
+ prefill_tokens.size() >= start_tokens.size() &&
+ std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
+ initial_state = REASONING_BUDGET_COUNTING;
+ // If the prefill also ends with the end sequence (after the start),
+ // reasoning was opened and immediately closed, so stay IDLE.
+ if (!end_tokens.empty() &&
+ prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
+ auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
+ if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
+ std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
+ initial_state = REASONING_BUDGET_IDLE;
+ }
+ }
+ }
+ return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
+}
+
+struct llama_sampler * common_reasoning_budget_init(
+ const struct llama_vocab * vocab,
+ const std::vector<llama_token> & start_tokens,
+ const std::vector<llama_token> & end_tokens,
+ const std::vector<llama_token> & forced_tokens,
+ int32_t budget,
+ common_reasoning_budget_state initial_state) {
+ return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
+}
// DONE: passthrough forever
//
// Parameters:
-// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
-// start_tokens - token sequence that activates counting
-// end_tokens - token sequence for natural deactivation
-// forced_tokens - token sequence forced when budget expires
-// budget - max tokens allowed in the reasoning block
-// initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
-// note: COUNTING with budget <= 0 is promoted to FORCING
+// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
+// start_tokens - token sequence that activates counting
+// end_tokens - token sequence for natural deactivation
+// forced_tokens - token sequence forced when budget expires
+// budget - max tokens allowed in the reasoning block
+// prefill_tokens - tokens already present in the prompt (generation prompt);
+// used to determine the initial state: COUNTING if they begin
+// with start_tokens (but don't also end with end_tokens),
+// IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
//
+struct llama_sampler * common_reasoning_budget_init(
+ const struct llama_vocab * vocab,
+ const std::vector<llama_token> & start_tokens,
+ const std::vector<llama_token> & end_tokens,
+ const std::vector<llama_token> & forced_tokens,
+ int32_t budget,
+ const std::vector<llama_token> & prefill_tokens = {});
+
+// Variant that takes an explicit initial state (used by tests and clone).
+// COUNTING with budget <= 0 is promoted to FORCING.
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
#include "sampling.h"
#include "common.h"
+#include "ggml.h"
#include "log.h"
#include "reasoning-budget.h"
#include <algorithm>
+#include <cctype>
#include <cmath>
#include <cstring>
#include <unordered_map>
+#include <vector>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
std::vector<llama_sampler *> samplers;
- if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+ const std::string & grammar_str = common_grammar_value(params.grammar);
+ if (grammar_str.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
- grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+ grmr = llama_sampler_init_llg(vocab, "lark", grammar_str.c_str());
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
trigger_patterns_c.push_back(regex.c_str());
}
- if (!params.grammar.empty()) {
+ if (!grammar_str.empty()) {
if (params.grammar_lazy) {
- grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+ grmr = llama_sampler_init_grammar_lazy_patterns(vocab, grammar_str.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size());
} else {
- grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+ grmr = llama_sampler_init_grammar(vocab, grammar_str.c_str(), "root");
}
}
}
+ // Feed generation prompt tokens to the grammar sampler so it advances past
+ // tokens the template already placed in the prompt.
+ // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
+ std::vector<llama_token> prefill_tokens;
+ if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
+ GGML_ASSERT(vocab != nullptr);
+ prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
+ if (!prefill_tokens.empty()) {
+ std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
+ if (!first_token.empty() && std::isspace((unsigned char) first_token[0]) && !std::isspace((unsigned char) params.generation_prompt[0])) {
+ // Some tokenizers will add a space before the first special token, need to remove
+ prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
+ }
+ }
+
+ if (grmr) {
+ try {
+ for (const auto & token : prefill_tokens) {
+ llama_sampler_accept(grmr, token);
+ LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
+ }
+ } catch (const std::exception &) {
+ LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
+ common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
+ throw;
+ }
+ }
+ }
+
// reasoning budget sampler — added first so it can force tokens before other samplers
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
samplers.push_back(common_reasoning_budget_init(
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens,
- params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
+ prefill_tokens));
}
if (params.has_logit_bias()) {
**Analysis + Parser Building in Two Steps**:
1. `autoparser::autoparser tmpl_analysis(tmpl)` — runs all differential comparisons and populates the analysis structs
-2. `autoparser::peg_generator::generate_parser(tmpl, params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar
+2. `autoparser::peg_generator::generate_parser(tmpl, params, tmpl_analysis)` — uses the analysis (where `params` is a `generation_params`) to build a PEG parser and optional GBNF grammar
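+
+A minimal call-site sketch of the two-step flow, mirroring `common_chat_templates_apply_jinja` (`tmpl` and `params` are assumed to be populated already):
+
+```cpp
+// Step 1: run all differential comparisons once and cache the results.
+struct autoparser::autoparser analysis;
+analysis.analyze_template(tmpl);
+
+// Step 2: build the PEG parser (and the lazy GBNF grammar, when tools are
+// present) from the cached analysis.
+common_chat_params chat_params =
+    autoparser::peg_generator::generate_parser(tmpl, params, analysis);
+```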
## Data Structures
### `analyze_tools` and its sub-structs
-- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`, `uses_python_dicts`)
+- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`)
- [common/chat-auto-parser.h:196-200](common/chat-auto-parser.h#L196-L200) — `tool_function_analysis`: `name_prefix`, `name_suffix`, `close` markers around function names
- [common/chat-auto-parser.h:202-210](common/chat-auto-parser.h#L202-L210) — `tool_arguments_analysis`: `start/end` container markers, `name_prefix/suffix`, `value_prefix/suffix`, `separator`
- [common/chat-auto-parser.h:212-217](common/chat-auto-parser.h#L212-L217) — `tool_id_analysis`: `pos` enum, `prefix`/`suffix` markers around call ID values
| Value | Description |
|-----------------|-----------------------------------------------------------------------------------|
| `NONE` | No reasoning markers detected |
-| `TAG_BASED` | Standard tag-based: `<think>...</think>` |
-| `DELIMITER` | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`) |
-| `FORCED_OPEN` | Template ends with open reasoning tag when `enable_thinking=true` |
-| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start |
+| `TAG_BASED` | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats) |
| `TOOLS_ONLY` | Reasoning only appears in tool call responses, not plain content |
+**Generation Prompt & Reasoning Prefill**: Computed in `common_chat_templates_apply_jinja` before invoking either the specialized handlers or the auto-parser, by rendering the template twice — once with `add_generation_prompt=false` and once with `add_generation_prompt=true` — and storing the diff suffix as `generation_params::generation_prompt`. This string is propagated into `common_chat_params::generation_prompt` and `common_chat_parser_params::generation_prompt`.
+
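A sketch of the double render, using `generation_params` and `common_chat_template_direct_apply` as they appear elsewhere in this patch; `tmpl` and `messages` are assumed to exist, and the suffix extraction is simplified:

```
autoparser::generation_params off, on;
off.messages = on.messages = messages;   // same conversation for both renders
off.add_generation_prompt = false;
on.add_generation_prompt  = true;

const std::string without = common_chat_template_direct_apply(tmpl, off);
const std::string with    = common_chat_template_direct_apply(tmpl, on);

// the generation prompt is whatever the second render appended
std::string generation_prompt;
if (with.size() > without.size() && with.compare(0, without.size(), without) == 0) {
    generation_prompt = with.substr(without.size());
}
```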
+The generation prompt is prepended to model output before PEG parsing via `wrap_for_generation_prompt()`. The portion *before* the reasoning start marker (if any) is prepended as a literal so that any boilerplate added by the template is consumed. The full string is also fed to the grammar sampler via `llama_sampler_accept` (stored in `common_params_sampling::generation_prompt`), advancing the grammar past tokens already present in the prompt. The same string determines the reasoning budget sampler's initial state: COUNTING if the prefill tokens begin with the reasoning start sequence (but do not also contain the end sequence), IDLE otherwise.
+
+**`generation_prompt`** (`common_params_sampling`): The generation prompt string, tokenized and accepted by the grammar sampler at init time. Only applied when the grammar was not set explicitly by the user (i.e. its type is not `COMMON_GRAMMAR_TYPE_USER`).
+
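A minimal sketch of that prefill step; `vocab`, `params`, and `grammar_smpl` are assumed to be set up by the caller, while `common_tokenize` and `llama_sampler_accept` are the calls used elsewhere in this patch:

```
// advance the grammar sampler past the generation prompt at init time
std::vector<llama_token> prefill_tokens =
    common_tokenize(vocab, params.generation_prompt, /* add_special */ false, /* parse_special */ true);
for (llama_token tok : prefill_tokens) {
    llama_sampler_accept(grammar_smpl, tok);
}
```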
+Three outcomes for reasoning-prefill handling (in `generate_parser()`), sketched after the list:
+
+1. **Start+end in generation prompt** (e.g. `<think></think>\n`): the parser sees reasoning as opened and immediately closed; whitespace-only reasoning content is discarded.
+2. **Only start in generation prompt** (e.g. `<think>\n`): the parser sees reasoning as already open.
+3. **Start marker present but not at the end** (e.g. Apriel's `<|begin_assistant|>` followed by boilerplate): the marker is a template artifact; the start literal is cleared so reasoning uses delimiter-style (end-only). For templates that ignore `add_generation_prompt` (empty diff), the rendered `data.prompt` is used as fallback — but only for non-TOOLS_ONLY modes, since in TOOLS_ONLY the start tag is model-generated and may appear in prior conversation turns.
+
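An illustrative sketch of the classification; the helper name and signature are hypothetical, but the three branches mirror the outcomes above:

```
#include <cctype>
#include <string>

static void classify_reasoning_prefill(const std::string & gen_prompt,
                                       const std::string & end_marker,
                                       std::string & start_marker,   // may be cleared (outcome 3)
                                       bool & reasoning_open) {
    reasoning_open = false;
    if (start_marker.empty()) {
        return;                                  // nothing to classify
    }
    const size_t pos = gen_prompt.find(start_marker);
    if (pos == std::string::npos) {
        return;                                  // no marker in the prompt
    }
    if (gen_prompt.find(end_marker, pos) != std::string::npos) {
        return;                                  // outcome 1: opened and closed, reasoning stays closed
    }
    // ignore trailing whitespace: templates often render "<think>\n"
    std::string trimmed = gen_prompt;
    while (!trimmed.empty() && std::isspace((unsigned char) trimmed.back())) {
        trimmed.pop_back();
    }
    if (trimmed.size() >= start_marker.size() &&
        trimmed.compare(trimmed.size() - start_marker.size(), start_marker.size(), start_marker) == 0) {
        reasoning_open = true;                   // outcome 2: prompt ends with the start tag
        return;
    }
    start_marker.clear();                        // outcome 3: template artifact, fall back to delimiter-style
}
```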
**`content_mode`**: How the template wraps assistant content.
| Value | Description |
- Searches `diff.right` (output with reasoning) for the reasoning content needle
- Uses PEG parsers to find surrounding markers:
- - If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
- - If both found but post marker only in the full output B → `FORCED_CLOSED`
- - If only post marker found → `DELIMITER`
+ - If both pre/post markers found in `diff.right` → `TAG_BASED`
+ - If both found but post marker only in the full output B → `TAG_BASED` (template forces markers; handled via prefill)
+ - If only post marker found → `TAG_BASED` (delimiter-style, empty start)
- Sets `reasoning.start` and `reasoning.end`
**R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.
-- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
-- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
-- Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
+- Detects template-added reasoning markers: `enable_thinking=true` appends a non-empty marker → sets `reasoning.start`, mode = `TAG_BASED`
+- Handles the reverse case (`enable_thinking=false` appends the marker instead): extracts both start (from the preceding segment) and end markers; mode = `TAG_BASED`
+- The reasoning prefill (markers added by the template) is later extracted in `common_chat_templates_apply_jinja` and prepended to model output before parsing
**R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.
A workaround array in `common/chat-diff-analyzer.cpp` applies post-hoc patches after analysis. Each workaround is a lambda that inspects the template source and overrides analysis results. Current workarounds:
-1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')`: sets `reasoning.mode = FORCED_OPEN` with `<think>`/`</think>` markers if no reasoning was detected
+1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')` but not `<SPECIAL_12>`: sets `reasoning.mode = TAG_BASED` with `<think>`/`</think>` markers if no reasoning was detected
2. **Granite 3.3** — source contains specific "Write your thoughts" text: forces `TAG_BASED` reasoning with `<think>`/`</think>` and `WRAPPED_WITH_REASONING` content with `<response>`/`</response>`
3. **Cohere Command R+** — source contains `<|CHATBOT_TOKEN|>`: sets `ALWAYS_WRAPPED` content mode if no content start is already set
4. **Functionary 3.1** — source contains `set has_code_interpreter`: forces `PLAIN` content, specific `per_call_start/end`, clears preserved tokens to only keep Functionary-specific markers
#### Reasoning Parser (`analyze_reasoning::build_parser`)
-| Mode | Parser |
-|-----------------------------------|---------------------------------------------------------------------|
-| Not extracting reasoning | `eps()` |
-| `FORCED_OPEN` or `FORCED_CLOSED` | `reasoning(until(end)) + end` — opening tag was in the prompt |
-| `TAG_BASED` or `TOOLS_ONLY` | `optional(start + reasoning(until(end)) + end)` |
-| `DELIMITER` | `optional(reasoning(until(end)) + end)` — no start marker |
+| Mode | Parser |
+|-----------------------------------------------|---------------------------------------------------------------------------|
+| Not extracting reasoning | `eps()` |
+| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end + space())` |
+| `TAG_BASED` or `TOOLS_ONLY` (empty start) | `optional(reasoning(until(end)) + end + space())` — delimiter-style |
+
+Note: The start marker may be empty either because the analyzer detected delimiter-style reasoning, or because `generate_parser()` cleared a template artifact start marker (see Generation Prompt & Reasoning Prefill above). Whitespace-only reasoning content (e.g. from a `<think></think>` prefill) is discarded by the mapper.
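For a concrete reference, the test parsers later in this patch build exactly these shapes. The fragment below is assumed to run inside a `build_chat_peg_parser` callback with builder `p`, with `<think>`/`</think>` standing in for the analyzer-detected markers:

```
// non-empty start marker: optional start + reasoning + end + trailing space
auto tag_based = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());

// empty start marker (delimiter-style): reasoning runs until the end marker
auto delimiter = p.optional(p.reasoning(p.until("</think>")) + "</think>" + p.space());
```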
#### Content Parser (`analyze_content::build_parser`)
reasoning + optional(content(until(trigger_marker))) + tool_calls + end()
```
-### Python Dict Format
-
-When `format.uses_python_dicts` is true (detected when single-quoted strings appear in JSON argument context), `build_parser()` pre-registers a `json-string` rule that accepts both single-quoted and double-quoted strings. This is done before any `p.json()` call so all JSON parsing inherits the flexible rule.
+Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepends a literal for any boilerplate prefix of the generation prompt (the portion before the reasoning start marker).
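Conceptually this is the same as prepending the generation prompt to the input before parsing, which is what the updated test harness in this patch does:

```
// from the test harness: the generation prompt is concatenated in front of the
// model output, then the combined string is parsed normally
std::string effective_input = tc.generation_prompt + tc.input;
common_peg_parse_context ctx(effective_input);
auto result = parser.parse(ctx);
```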
## Mapper
- **Buffered arguments**: Before `tool_name` is known, argument text goes to `args_buffer`; once the name is set, the buffer is flushed to `current_tool->arguments`
- **`args_target()`**: Returns a reference to whichever destination is currently active (buffer or tool args), eliminating branching (see the sketch after this list)
- **`closing_quote_pending`**: Tracks whether a closing `"` needs to be appended when a string argument value is finalized (for schema-declared string types in tagged format)
-- **Quote normalization**: Python-style quotes (`'key': 'value'`) are converted to JSON (`"key": "value"`)
+- **Whitespace-only reasoning**: Reasoning content that consists entirely of whitespace (e.g. from a `<think></think>` prefill) is cleared so the message shows no reasoning
- **Brace auto-closing**: At tool close, unclosed `{` braces are closed automatically
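A hypothetical shape of the buffered-arguments mechanism; the type name is illustrative and follows the bullets above, not the actual implementation:

```
// return the currently active destination for argument text: the temporary
// buffer before the tool name is known, the tool's arguments afterwards
std::string & mapper_state::args_target() {
    return current_tool != nullptr ? current_tool->arguments : args_buffer;
}
```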
## Files
-| File | Purpose |
-|-------------------------------------------|----------------------------------------------------------------------|
-| `common/chat-auto-parser.h` | All analysis structs, enums, `autoparser`, `peg_generator`, `templates_params` |
-| `common/chat-auto-parser-generator.cpp` | Parser generator: `generate_parser()` and `build_parser()` methods |
-| `common/chat-diff-analyzer.cpp` | Differential analysis implementation and workarounds |
-| `common/chat-auto-parser-helpers.h/cpp` | `calculate_diff_split()`, `segmentize_markers()`, |
-| | `compare_variants()`, string helpers |
-| `common/chat-peg-parser.h/cpp` | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers |
-| `common/chat.cpp` | Entry point: `common_chat_templates_apply_jinja()` |
-| `tools/parser/debug-template-parser.cpp` | Debug tool for template analysis |
-| `tools/parser/template-analysis.cpp` | Template analysis tool |
+| File | Purpose |
+|-------------------------------------------|---------------------------------------------------------------------------------|
+| `common/chat-auto-parser.h` | All analysis structs, enums, `autoparser`, `peg_generator`, `generation_params` |
+| `common/chat-auto-parser-generator.cpp` | Parser generator: `generate_parser()` and `build_parser()` methods |
+| `common/chat-diff-analyzer.cpp` | Differential analysis implementation and workarounds |
+| `common/chat-auto-parser-helpers.h/cpp`   | `calculate_diff_split()`, `segmentize_markers()`, `compare_variants()`, `wrap_for_generation_prompt()`, string helpers |
+| `common/chat-peg-parser.h/cpp` | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers |
+| `common/chat.cpp` | Entry point: `common_chat_templates_apply_jinja()` |
+| `tools/parser/debug-template-parser.cpp` | Debug tool for template analysis |
+| `tools/parser/template-analysis.cpp` | Template analysis tool |
## Testing & Debugging
## Edge Cases and Quirks
-1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
+1. **Generation Prompt & Reasoning Prefill**: The generation prompt is extracted by diffing `add_generation_prompt=false` vs `true` in `common_chat_templates_apply_jinja`, so it contains exactly what the template appends — avoiding false positives from prior conversation turns.
2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
-3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
-4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
-5. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
-6. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
-7. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
+3. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
+4. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
+5. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
+6. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
+7. **Undetected Tool Format**: If `analyze_tools` concludes tool calling is supported but cannot determine the format, `build_parser()` logs an error and returns `eps()` (graceful degradation) rather than aborting.
{%- set available_tool_string = '' -%}
{%- set add_tool_id = true -%}
{%- set add_thoughts = true -%} {# whether to include <thinking> reasoning blocks #}
-{%- set add_generation_prompt = true -%} {# whether to emit reasoning starter before assistant response #}
{# Optional token placeholders (safe defaults) #}
{%- set bos_token = bos_token or '' -%}
{%- set eos_token = eos_token or '' -%}
{%- set ns.is_tool = false -%}
{%- for tool in message['tool_calls']-%}
{%- if not ns.is_first -%}
- {{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}
+ {{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] | tojson + '\n' + '```' + '<|tool▁call▁end|>'}}
{%- set ns.is_first = true -%}
{%- else -%}
- {{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}
+ {{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] | tojson + '\n' + '```' + '<|tool▁call▁end|>'}}
{%- endif -%}
{%- endfor -%}
{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}
{%- set ns.is_last_user = true -%}{{'<|User|>' + message['content']}}
{%- endif -%}
{%- if message['role'] == 'assistant' and message['tool_calls'] -%}
- {%- if ns.is_last_user -%}{{'<|Assistant|></think>'}}
+ {%- if ns.is_last_user -%}{{'<|Assistant|><think></think>'}}
{%- endif -%}
{%- set ns.is_last_user = false -%}
{%- set ns.is_first = false -%}
{%- set ns.is_tool = false -%}
{%- for tool in message['tool_calls'] -%}
{%- if not ns.is_first -%}
- {%- if not message['content'] -%}{{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}
- {%- else -%}{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}
+ {%- if not message['content'] -%}{{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] | tojson + '<|tool▁call▁end|>'}}
+ {%- else -%}{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] | tojson + '<|tool▁call▁end|>'}}
{%- endif -%}
{%- set ns.is_first = true -%}
- {%- else -%}{{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}
+ {%- else -%}{{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] | tojson + '<|tool▁call▁end|>'}}
{%- endif -%}
{%- endfor -%}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}
{%- endif -%}
{%- if message['role'] == 'assistant' and not message['tool_calls'] -%}
{%- if ns.is_last_user -%}{{'<|Assistant|>'}}
{%- if message['prefix'] is defined and message['prefix'] and thinking -%}{{'<think>'}}
- {%- else -%}{{'</think>'}}
+ {%- else -%}{{'<think></think>'}}
{%- endif -%}
{%- endif -%}
{%- set ns.is_last_user = false -%}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool -%}{{'<|Assistant|>'}}
- {%- if not thinking -%}{{'</think>'}}
- {%- else -%}{{'<think>'}}
+ {%- if not thinking -%}{{'<think></think>'}}
+ {%- else -%}{{'<think>'}}
{%- endif -%}
{%- endif %}
\ No newline at end of file
{%- endif -%}
{%- set tool_name = tc['function']['name'] -%}
{%- set tool_args = tc['function']['arguments'] -%}
- {{- '<|tool▁call▁begin|>' + tc['type'] + '<|tool▁sep|>' + tool_name + '\n' + '```json' + '\n' + tool_args + '\n' + '```' + '<|tool▁call▁end|>' -}}
+ {{- '<|tool▁call▁begin|>' + tc['type'] + '<|tool▁sep|>' + tool_name + '\n' + '```json' + '\n' + tool_args | tojson + '\n' + '```' + '<|tool▁call▁end|>' -}}
{%- endfor -%}
{{- '<|tool▁calls▁end|><|end▁of▁sentence|>' -}}
{%- endif -%}
{%- if 'tool_calls' in message and message['tool_calls'] -%}
{%- for tool_call in message['tool_calls'] -%}
{%- if tool_call["function"]["name"] == "python" -%}
- {{ '<|python_tag|>' + tool_call['function']['arguments'] }}
+ {{ '<|python_tag|>' + tool_call['function']['arguments'] | tojson }}
{%- else -%}
- {{ '<function=' + tool_call['function']['name'] + '>' + tool_call['function']['arguments'] + '</function>' }}
+ {{ '<function=' + tool_call['function']['name'] + '>' + tool_call['function']['arguments'] | tojson + '</function>' }}
{%- endif -%}
{%- endfor -%}
{{ '<|eom_id|>' }}
// Check reasoning markers
t.assert_equal("reasoning_start should be '<think>'", "<think>", analysis.reasoning.start);
- t.assert_equal("reasoning_end should be '</think>\\n'", "</think>\n", analysis.reasoning.end);
+ t.assert_equal("reasoning_end should be '</think>'", "</think>", analysis.reasoning.end);
// Check reasoning mode detection
- // Nemotron uses forced closed reasoning with add_generation_prompt
- t.assert_equal("reasoning should be FORCED_CLOSED", reasoning_mode::FORCED_CLOSED, analysis.reasoning.mode);
+ // Nemotron uses tag-based reasoning; prefill handles the template's forced markers
+ t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode);
// Make sure reasoning markers don't spill over to content markers
t.assert_equal("content start should be empty", "", analysis.content.start);
common_reasoning_format reasoning_format;
json json_schema;
bool parallel_tool_calls;
- bool thinking_forced_open;
+ std::string generation_prompt;
std::string input;
// Expect
auto build_parser = [](const test_case & tc) {
return build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
- auto reasoning = p.eps();
- if (tc.thinking_forced_open) {
- // If thinking is forced open, expect a closing tag
- reasoning = p.reasoning(p.until("</think>")) + "</think>" + p.space();
- } else {
- // Otherwise, optionally accept thinking wrapped in tags
- reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
- }
+ // Always use optional TAG_BASED pattern; generation_prompt is prepended to input
+ auto reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
// tool calling parser
if (tc.tools.is_array() && !tc.tools.empty()) {
std::vector<test_case> test_cases = std::vector<test_case>{
{
- /* .name = */ "content with thinking_forced_open = false",
+ /* .name = */ "content with reasoning (no generation_prompt)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ false,
+ /* .generation_prompt = */ "",
/* .input = */ ("<think>The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = false and no reasoning",
+ /* .name = */ "content without reasoning (no generation_prompt)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ false,
+ /* .generation_prompt = */ "",
/* .input = */ ("Hello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = false and reasoning_format = none",
+ /* .name = */ "content with reasoning_format = none (tags appear in content)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .generation_prompt = */ "",
/* .input = */ ("<think>The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "<think>The user said hello, I must say hello back</think>\nHello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = true",
+ /* .name = */ "content with reasoning generation_prompt",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .generation_prompt = */ "<think>",
/* .input = */ ("The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = true and reasoning_format = none",
+ /* .name = */ "content with reasoning generation_prompt and reasoning_format = none",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .generation_prompt = */ "",
/* .input = */ ("The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "The user said hello, I must say hello back</think>\nHello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "tools with tool_choice = auto and no parallel_tool_calls",
+ /* .name = */ "content with closed reasoning generation_prompt (empty reasoning discarded)",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .generation_prompt = */ "<think></think>",
+ /* .input = */ ("Hello"),
+ /* .expect_reasoning = */ "",
+ /* .expect_content = */ "Hello",
+ /* .expect_tool_calls = */ {},
+ },
+ {
+ /* .name = */ "tools with reasoning generation_prompt",
/* .tools = */ create_tools(),
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .generation_prompt = */ "<think>",
/* .input = */
("I must get the weather in New York</think>\n"
"<tool_call>["
} },
},
{
- /* .name = */ "tools with tool_choice = auto and parallel_tool_calls",
+ /* .name = */ "parallel tools with reasoning generation_prompt",
/* .tools = */ create_tools(),
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ true,
- /* .thinking_forced_open = */ true,
+ /* .generation_prompt = */ "<think>",
/* .input = */
("I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me "
"search that for you."
} },
},
{
- /* .name = */ "response_format with thinking_forced_open = true",
+ /* .name = */ "response_format with reasoning generation_prompt",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
{ "due_date", { { "type", "string" } } } } },
{ "required", { "invoice_number", "amount", "due_date" } } },
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .generation_prompt = */ "<think>",
/* .input = */
("I must produce the invoice in the requested format</think>\n"
R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
t.log(line);
}
- common_peg_parse_context ctx(tc.input);
+ std::string effective_input = tc.generation_prompt + tc.input;
+ common_peg_parse_context ctx(effective_input);
auto result = parser.parse(ctx);
t.assert_true("success", result.success());
}
common_chat_msg parse(const std::string & msg, bool is_partial) const {
- common_chat_parser_params parser_params;
- parser_params.format = params_.format;
+ common_chat_parser_params parser_params(params_);
parser_params.debug = detailed_debug_;
return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
}
grammar_triggered = true;
}
+ // For non-lazy grammars, prepend reasoning prefill to grammar input, just like
+ // PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional
+ // <think>...</think>), but the model output may start mid-reasoning if the template
+ // already placed the opening tag in the prompt.
+ // For lazy grammars, the grammar only activates from the trigger position, so the
+ // reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
+ if (!parser.params_.generation_prompt.empty() && earliest_trigger_pos == std::string::npos) {
+ constrained = parser.params_.generation_prompt + constrained;
+ }
+
// Test the constrained portion against the grammar
if (grammar_triggered && !tc.is_partial) {
auto result = match_string_detailed(constrained, grammar.get());
tst.test("[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+ .enable_thinking(true)
.expect(message_assist_thoughts)
.run();
tst.test(R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+ .enable_thinking(true)
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
"[THINK]I'm\nthinking[/THINK]"
R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+ .enable_thinking(true)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// NVIDIA Nemotron-3 Nano
auto tst = peg_tester("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja", detailed_debug);
- tst.test("Hello, world!\nWhat's up?").enable_thinking(false).expect(message_assist).run();
+ tst.test("Hello, world!\nWhat's up?").
+ enable_thinking(false).
+ reasoning_format(COMMON_REASONING_FORMAT_AUTO).
+ expect(message_assist).run();
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
- .enable_thinking(false)
+ .enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
- .expect_content("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
+ .expect_content("<think>I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
.run();
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
.expect(simple_assist_msg("The answer is 42.", "Let me think about this..."))
.run();
- tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
+ tst.test("</think>Hello, world!").reasoning_format(COMMON_REASONING_FORMAT_AUTO).expect(simple_assist_msg("Hello, world!")).run();
}
{
// NousResearch-Hermes-2-Pro and Hermes-3 (tool calling models)
"<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": "
"\"XYZCITY\"}<|tool▁call▁end|><|tool▁calls▁end|>")
.tools({ get_time_tool })
+ .enable_thinking(false)
+ .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
}
{
auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-V3.1.jinja", detailed_debug);
- tst.test("CONTENT").expect(simple_assist_msg("CONTENT", "")).run();
+ tst.test("CONTENT").enable_thinking(false).reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK).
+ expect(simple_assist_msg("CONTENT", "")).run();
}
// GLM-4.6 tests - format: <tool_call>function_name\n<arg_key>...</arg_key>\n<arg_value>...</arg_value>\n</tool_call>
"<arg_key>arg1</arg_key><arg_value>1</arg_value>"
"<arg_key>arg2</arg_key><arg_value>2</arg_value>"
"</tool_call>")
+ .enable_thinking(false)
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
{
auto tst = peg_tester("models/templates/MiniMax-M2.jinja", detailed_debug);
tst.test(
- "<minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
+ "</think><minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
"name=\"arg1\">1</parameter>\n</invoke>\n</minimax:tool_call>")
.tools({ special_function_tool })
.expect(message_assist_call)
+ .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
.run();
}
// Functionary v3.2 - recipient-based format: >>>recipient\n{content}
{
auto tst = peg_tester("models/templates/meetkai-functionary-medium-v3.2.jinja", detailed_debug);
- tst.test(">>>all\nHello, world!\nWhat's up?").expect(message_assist).run();
- tst.test(">>>special_function\n{\"arg1\": 1}")
+ tst.test("all\nHello, world!\nWhat's up?").expect(message_assist).run();
+ tst.test("special_function\n{\"arg1\": 1}")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Note: Template uses forced-open mode (prompt ends with <think>), so input shouldn't include opening tag
{
auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja", detailed_debug);
- tst.test("Hello, world!\nWhat's up?")
- .enable_thinking(true) // Forced open
+ tst.test("</think>Hello, world!\nWhat's up?")
+ .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
.expect(message_assist)
.run();
tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?")
// llama-cpp DeepSeek R1 template (always forced-open thinking)
{
auto tst = peg_tester("models/templates/llama-cpp-deepseek-r1.jinja", detailed_debug);
- tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
+ tst.test("</think>Hello, world!\nWhat's up?").expect(message_assist).reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK).run();
tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
.expect(message_assist_thoughts)
.run();
tst.test(
- "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
+ "</think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
"```json\n{\"arg1\": 1}```<|tool▁call▁end|><|tool▁calls▁end|>")
+ .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
.tools({ special_function_tool })
.parallel_tool_calls(true)
.expect(message_assist_call)
// Note: Template uses forced-open mode (prompt ends with <think>), so input shouldn't include opening tag
{
auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja", detailed_debug);
- tst.test("Hello, world!\nWhat's up?").enable_thinking(true).expect(message_assist).run();
+ tst.test("</think>Hello, world!\nWhat's up?").enable_thinking(true).
+ reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK).
+ expect(message_assist).run();
tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
.expect(message_assist_thoughts)
"<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
"```json\n{\"arg1\": 1}```<|tool▁call▁end|><|tool▁calls▁end|>")
.tools({ special_function_tool })
+ .enable_thinking(false)
+ .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
.expect(message_assist_call)
.run();
}
// Apriel 1.6 Thinker (reasoning-only support)
{
auto tst = peg_tester("models/templates/Apriel-1.6-15b-Thinker-fixed.jinja", detailed_debug);
- tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Implicit reasoning start (forced open)
tst.test("I'm\nthinking\n[BEGIN FINAL RESPONSE]\nHello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
- .expect(message_assist_thoughts)
+ .enable_thinking(true)
+ .expect(simple_assist_msg("Hello, world!\nWhat's up?", "Here are my reasoning steps:\nI'm\nthinking"))
.run();
// Reasoning + Tool calls
"I'm\nthinking\n[BEGIN FINAL RESPONSE]\n<tool_calls>[{\"name\": \"special_function\", \"arguments\": "
"{\"arg1\": 1}}]</tool_calls>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+ .enable_thinking(true)
.tools({ special_function_tool })
- .expect(message_assist_call_thoughts)
+ .expect(simple_assist_msg("", "Here are my reasoning steps:\nI'm\nthinking", "special_function", "{\"arg1\":1}"))
.run();
}
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
- task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
+ task.params.sampling.generation_prompt = chat_params.generation_prompt;
if (!chat_params.thinking_start_tag.empty()) {
task.params.sampling.reasoning_budget_start =
LOG_ERR("Messages:\n%s\n", final_messages.dump(2).c_str());
try {
- autoparser::templates_params inputs;
+ autoparser::generation_params inputs;
inputs.messages = final_messages;
inputs.add_generation_prompt = add_generation_prompt;
inputs.extra_context["enable_thinking"] = enable_thinking;
analysis.analyze_template(chat_template);
// Generate Parser
- autoparser::templates_params params;
+ autoparser::generation_params params;
params.messages = json::array({ build_user_message() });
params.reasoning_format =
opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
{
json user_msg = make_user_msg();
- autoparser::templates_params params_no_tools;
+ autoparser::generation_params params_no_tools;
params_no_tools.messages = json::array({ user_msg });
params_no_tools.add_generation_prompt = false;
params_no_tools.tools = json::array();
- autoparser::templates_params params_with_tools = params_no_tools;
+ autoparser::generation_params params_with_tools = params_no_tools;
params_with_tools.tools = tools;
std::string output_no_tools = common_chat_template_direct_apply(chat_template, params_no_tools);
{
json user_msg = make_user_msg();
- autoparser::templates_params params_no_prompt;
+ autoparser::generation_params params_no_prompt;
params_no_prompt.messages = json::array({ user_msg });
params_no_prompt.add_generation_prompt = false;
params_no_prompt.tools = json::array();
- autoparser::templates_params params_with_prompt = params_no_prompt;
+ autoparser::generation_params params_with_prompt = params_no_prompt;
params_with_prompt.add_generation_prompt = true;
std::string output_no_prompt = common_chat_template_direct_apply(chat_template, params_no_prompt);
{
json user_msg = make_user_msg();
- autoparser::templates_params params_no_reasoning;
+ autoparser::generation_params params_no_reasoning;
params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning() });
params_no_reasoning.add_generation_prompt = false;
params_no_reasoning.enable_thinking = true;
- autoparser::templates_params params_with_reasoning = params_no_reasoning;
+ autoparser::generation_params params_with_reasoning = params_no_reasoning;
params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning() });
std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
json user_msg = make_user_msg();
json user_msg2 = make_user_msg2();
- autoparser::templates_params params_no_reasoning;
+ autoparser::generation_params params_no_reasoning;
params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning(), user_msg2 });
params_no_reasoning.add_generation_prompt = false;
params_no_reasoning.enable_thinking = true;
- autoparser::templates_params params_with_reasoning = params_no_reasoning;
+ autoparser::generation_params params_with_reasoning = params_no_reasoning;
params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning(), user_msg2 });
std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
{
json user_msg = make_user_msg();
- autoparser::templates_params params_no_tool;
+ autoparser::generation_params params_no_tool;
params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool() });
params_no_tool.add_generation_prompt = false;
params_no_tool.tools = tools;
- autoparser::templates_params params_with_tool = params_no_tool;
+ autoparser::generation_params params_with_tool = params_no_tool;
params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool() });
std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
json user_msg = make_user_msg();
json user_msg2 = make_user_msg2_continue();
- autoparser::templates_params params_no_tool;
+ autoparser::generation_params params_no_tool;
params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool(), user_msg2 });
params_no_tool.add_generation_prompt = false;
params_no_tool.tools = tools;
- autoparser::templates_params params_with_tool = params_no_tool;
+ autoparser::generation_params params_with_tool = params_no_tool;
params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });
std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
{
json user_msg = make_user_msg();
- autoparser::templates_params params_one_tool;
+ autoparser::generation_params params_one_tool;
params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool() });
params_one_tool.add_generation_prompt = false;
params_one_tool.tools = tools;
- autoparser::templates_params params_two_tools = params_one_tool;
+ autoparser::generation_params params_two_tools = params_one_tool;
params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools() });
std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
json user_msg = make_user_msg();
json user_msg2 = make_user_msg2_continue();
- autoparser::templates_params params_one_tool;
+ autoparser::generation_params params_one_tool;
params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });
params_one_tool.add_generation_prompt = false;
params_one_tool.tools = tools;
- autoparser::templates_params params_two_tools = params_one_tool;
+ autoparser::generation_params params_two_tools = params_one_tool;
params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools(), user_msg2 });
std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
{
json user_msg = make_user_msg();
- autoparser::templates_params params_no_reasoning;
+ autoparser::generation_params params_no_reasoning;
params_no_reasoning.messages = json::array({ user_msg, make_assistant_one_tool() });
params_no_reasoning.add_generation_prompt = false;
params_no_reasoning.tools = tools;
params_no_reasoning.enable_thinking = true;
- autoparser::templates_params params_with_reasoning = params_no_reasoning;
+ autoparser::generation_params params_with_reasoning = params_no_reasoning;
params_with_reasoning.messages = json::array({ user_msg, make_assistant_one_tool_with_reasoning() });
std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
- "thinking_forced_open": false,
+ "generation_prompt": "",
"samplers": [
"penalties",
"dry",
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
- "thinking_forced_open": false,
+ "generation_prompt": "",
"samplers": [
"penalties",
"dry",
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
-`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
+`generation_prompt`: The generation prompt that the template prefilled (everything the template appends after the last message). It is prepended to the model output before parsing.
`parse_tool_calls`: Whether to parse the generated tool call.
}
}
- llama_params["chat_format"] = static_cast<int>(chat_params.format);
- llama_params["prompt"] = chat_params.prompt;
+ llama_params["chat_format"] = static_cast<int>(chat_params.format);
+ llama_params["prompt"] = chat_params.prompt;
if (!chat_params.grammar.empty()) {
- llama_params["grammar"] = chat_params.grammar;
+ llama_params["grammar"] = chat_params.grammar;
+ llama_params["grammar_type"] = std::string("tool_calls");
}
- llama_params["grammar_lazy"] = chat_params.grammar_lazy;
- auto grammar_triggers = json::array();
+ llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+ auto grammar_triggers = json::array();
for (const auto & trigger : chat_params.grammar_triggers) {
server_grammar_trigger ct(trigger);
grammar_triggers.push_back(ct.to_json());
}
- llama_params["grammar_triggers"] = grammar_triggers;
- llama_params["preserved_tokens"] = chat_params.preserved_tokens;
- llama_params["thinking_forced_open"] = chat_params.thinking_forced_open;
+ llama_params["grammar_triggers"] = grammar_triggers;
+ llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+ llama_params["generation_prompt"] = chat_params.generation_prompt;
for (const auto & stop : chat_params.additional_stops) {
llama_params["stop"].push_back(stop);
}
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
- llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
}
}
#include <algorithm>
#include <cstddef>
#include <cinttypes>
+#include <exception>
#include <memory>
#include <filesystem>
// initialize samplers
if (task.need_sampling()) {
- slot.smpl.reset(common_sampler_init(model, task.params.sampling));
-
- if (slot.smpl == nullptr) {
- // for now, the only error that may happen here is invalid grammar
- send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
+ try {
+ slot.smpl.reset(common_sampler_init(model, task.params.sampling));
+ } catch (std::exception & e) {
+ std::string err_msg = std::string("Failed to initialize samplers: ") + e.what();
+ send_error(task, err_msg, ERROR_TYPE_INVALID_REQUEST);
return false;
}
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
- {"thinking_forced_open", chat_parser_params.thinking_forced_open},
+ {"generation_prompt", chat_parser_params.generation_prompt},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
{"logit_bias", format_logit_bias(sampling.logit_bias)},
{"n_probs", sampling.n_probs},
{"min_keep", sampling.min_keep},
- {"grammar", sampling.grammar},
+ {"grammar", common_grammar_value(sampling.grammar)},
{"grammar_lazy", sampling.grammar_lazy},
{"grammar_triggers", grammar_triggers},
{"preserved_tokens", sampling.preserved_tokens},
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
- {"thinking_forced_open", chat_parser_params.thinking_forced_open},
+ {"generation_prompt", chat_parser_params.generation_prompt},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
try {
auto schema = json_value(data, "json_schema", json::object());
SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
- params.sampling.grammar = json_schema_to_grammar(schema);
- SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+ std::string grammar_str = json_schema_to_grammar(schema);
+ SRV_DBG("Converted grammar: %s\n", grammar_str.c_str());
+ params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, std::move(grammar_str)};
} catch (const std::exception & e) {
throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
}
} else {
- params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
- SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+ std::string grammar_str = json_value(data, "grammar", std::string());
+ if (!grammar_str.empty()) {
+ // grammar_type key is set by the server when converting chat template grammars
+ std::string grammar_type = json_value(data, "grammar_type", std::string());
+ if (grammar_type == "tool_calls") {
+ params.sampling.grammar = {COMMON_GRAMMAR_TYPE_TOOL_CALLS, std::move(grammar_str)};
+ } else {
+ // explicit grammar from the user (API field "grammar")
+ params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, std::move(grammar_str)};
+ }
+ SRV_DBG("Grammar (%s): %s\n", grammar_type.c_str(), common_grammar_value(params.sampling.grammar).c_str());
+ }
params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
}
}
params.chat_parser_params.reasoning_format = reasoning_format;
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
- params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+ params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
+ params.sampling.generation_prompt = params.chat_parser_params.generation_prompt;
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
const auto message = json_value(data, "reasoning_budget_message", std::string());
- const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false);
-
params.sampling.reasoning_budget_tokens = budget;
- params.sampling.reasoning_budget_activate_immediately = activate_imm;
if (!start_tag.empty()) {
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
}
- SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
- budget, activate_imm ? "true" : "false",
+ SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
+ budget, params.sampling.generation_prompt.c_str(),
params.sampling.reasoning_budget_start.size(),
params.sampling.reasoning_budget_end.size(),
params.sampling.reasoning_budget_forced.size());
def test_completion_with_json_schema(jinja: bool, json_schema: dict, n_predicted: int, re_content: str):
global server
server.jinja = jinja
+ server.debug = True
server.start()
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": n_predicted,
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
- thinking_forced_open: false,
+ generation_prompt: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
- thinking_forced_open: false,
+ generation_prompt: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
- thinking_forced_open: boolean;
+ generation_prompt: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
- thinking_forced_open: boolean;
+ generation_prompt: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;