    json extra_context;
    bool add_bos;
    bool add_eos;
+    bool is_inference = true;
};
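The flag defaults to true, so existing call sites keep the inference-time token
swap below. A caller rendering a finished conversation for non-inference use
(e.g. producing training text) could opt out by clearing it; a minimal
hypothetical sketch, assuming this struct is the templates_params that apply()
consumes:

    // Hypothetical caller: keep the template's <|return|> terminator
    // by disabling the inference-time swap.
    templates_params params;               // assumed struct name
    params.add_generation_prompt = false;
    params.is_inference          = false;  // skip the <|return|> -> <|end|> rewrite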
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    auto prompt = apply(tmpl, inputs);
+    // Check whether we need to replace the return token with the end token
+    // during inference when no generation prompt is added. For more details
+    // see: https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token    = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
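Because rfind is used, only the final <|return|> emitted by the template is
rewritten; earlier turns that already terminate with <|end|> are left
untouched. A self-contained sketch of the same rfind/replace pattern (the
prompt string here is made up for illustration):

    #include <iostream>
    #include <string>
    #include <string_view>

    int main() {
        std::string prompt =
            "<|start|>user<|message|>hi<|end|>"
            "<|start|>assistant<|message|>hello<|return|>";
        constexpr std::string_view return_token = "<|return|>";
        constexpr std::string_view end_token    = "<|end|>";
        // Swap only the last occurrence of the return token.
        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
            prompt.replace(pos, return_token.length(), end_token);
        }
        std::cout << prompt << '\n';  // ...assistant<|message|>hello<|end|>
    }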