    json extra_context;
    bool add_bos;
    bool add_eos;
+    bool is_inference = true;
};
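The flag defaults to true, so existing call sites keep the inference-time token
swap below. A caller rendering a finished conversation for non-inference use
(e.g. producing training text) could opt out by clearing it; a minimal
hypothetical sketch, assuming this struct is the templates_params that apply()
consumes:

    // Hypothetical caller: keep the template's <|return|> terminator
    // by disabling the inference-time swap.
    templates_params params;               // assumed struct name
    params.add_generation_prompt = false;
    params.is_inference          = false;  // skip the <|return|> -> <|end|> rewrite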
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    auto prompt = apply(tmpl, inputs);
+    // Check whether we need to replace the return token with the end token
+    // during inference when no generation prompt is added. For more details
+    // see: https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token    = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
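Because rfind is used, only the final <|return|> emitted by the template is
rewritten; earlier turns that already terminate with <|end|> are left
untouched. A self-contained sketch of the same rfind/replace pattern (the
prompt string here is made up for illustration):

    #include <iostream>
    #include <string>
    #include <string_view>

    int main() {
        std::string prompt =
            "<|start|>user<|message|>hi<|end|>"
            "<|start|>assistant<|message|>hello<|return|>";
        constexpr std::string_view return_token = "<|return|>";
        constexpr std::string_view end_token    = "<|end|>";
        // Swap only the last occurrence of the return token.
        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
            prompt.replace(pos, return_token.length(), end_token);
        }
        std::cout << prompt << '\n';  // ...assistant<|message|>hello<|end|>
    }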