git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
refactor: centralize CoT parsing in backend for streaming mode (#16394)
author Pascal <redacted>
Wed, 8 Oct 2025 20:18:41 +0000 (22:18 +0200)
committer GitHub <redacted>
Wed, 8 Oct 2025 20:18:41 +0000 (23:18 +0300)
* refactor: unify reasoning handling via backend reasoning_content, drop frontend tag parsing

- Updated the chat message component to surface backend-supplied reasoning via message.thinking while showing the raw assistant content without inline tag scrubbing
- Simplified chat streaming to append content chunks directly, stream reasoning into the message model, and persist any partial reasoning when generation stops
- Refactored the chat service SSE handler to rely on server-provided reasoning_content, removing legacy <think> parsing logic
- Refreshed Storybook data and streaming flows to populate the thinking field explicitly for static and streaming assistant messages

* refactor: implement streaming-aware universal reasoning parser

Remove the streaming mode limitation from --reasoning-format by refactoring
try_parse_reasoning() to handle incremental parsing of <think> tags across
all formats.

- Rework try_parse_reasoning() to track whitespace, partial tags, and
  multiple reasoning segments, allowing proper separation of reasoning_content
  and content in streaming mode
- Parse reasoning tags before tool call handling in content-only and Llama 3.x
  formats to ensure inline <think> blocks are captured correctly
- Change default reasoning_format from 'auto' to 'deepseek' for consistent
  behavior
- Add 'deepseek-legacy' option to preserve old inline behavior when needed
- Update CLI help and documentation to reflect streaming support
- Add parser tests for inline <think>...</think> segments

The parser now continues processing content after </think> closes instead of
stopping, enabling proper message.reasoning_content and message.content
separation in both streaming and non-streaming modes.

Fixes the issue where streaming responses would dump everything (including
post-thinking content) into reasoning_content while leaving content empty.
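
A minimal usage sketch of the resulting behavior (mirroring the new test-chat-parser cases and assuming the declarations from common/chat.h):

    common_chat_syntax syntax = {
        /* .format               = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
        /* .reasoning_format     = */ COMMON_REASONING_FORMAT_DEEPSEEK,
        /* .reasoning_in_content = */ false,
        /* .thinking_forced_open = */ false,
        /* .parse_tool_calls     = */ false,
    };

    // Non-streaming call (is_partial = false); the streaming path feeds growing
    // prefixes of the same text with is_partial = true and converges on the same split.
    auto msg = common_chat_parse("<think>Pense</think>Bonjour", /* is_partial */ false, syntax);
    // msg.reasoning_content == "Pense"
    // msg.content           == "Bonjour"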

* refactor: address review feedback from allozaur

- Passed the assistant message content directly to ChatMessageAssistant to drop the redundant derived state in the chat message component
- Simplified chat streaming updates by removing unused partial-thinking handling and persisting partial responses straight from currentResponse
- Refreshed the ChatMessage stories to cover standard and reasoning scenarios without the old THINK-tag parsing examples

Co-authored-by: Aleksander Grygier <redacted>

* refactor: restore forced reasoning prefix to pass test-chat ([chat] All tests passed)

- Store the exact sequence seen on input when 'thinking_forced_open' enforces a reasoning block
- Inject this prefix before the first accumulated segment in 'reasoning_content', then clear it to avoid duplication
- Repeat the capture on every new 'start_think' detection to properly handle partial/streaming flows
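
A rough, self-contained illustration of that prefix handling (hypothetical stand-in code, not the actual common_chat_msg_parser implementation): the captured opening tag is emitted exactly once, in front of the first accumulated reasoning segment, and then cleared.

    #include <iostream>
    #include <string>

    int main() {
        std::string pending_reasoning_prefix = "<think>"; // captured when the opening tag is seen on input
        std::string reasoning_content;

        // hypothetical stand-in for add_reasoning_content()
        auto add_reasoning = [&](const std::string & segment) {
            if (!pending_reasoning_prefix.empty()) {
                reasoning_content += pending_reasoning_prefix; // injected once
                pending_reasoning_prefix.clear();              // never duplicated
            }
            reasoning_content += segment;
        };

        add_reasoning("first streamed segment ");
        add_reasoning("second streamed segment");

        std::cout << reasoning_content << "\n";
        // prints: <think>first streamed segment second streamed segment
    }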

* refactor: address review feedback from ngxson

* debug: say goodbye to curl -N, hello one-click raw stream

- Adds a new checkbox in the WebUI to display raw LLM output without backend parsing or frontend Markdown rendering

* Update tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte

Co-authored-by: Aleksander Grygier <redacted>

* webui: add Storybook example for raw LLM output and scope reasoning format toggle per story

- Added a Storybook example that showcases the chat message component in raw LLM output mode with the provided trace sample
- Updated every ChatMessage story to toggle the disableReasoningFormat setting so the raw-output rendering remains scoped to its own example

* npm run format

* chat-parser: address review feedback from ngxson

Co-authored-by: Xuan Son Nguyen <redacted>

---------

Co-authored-by: Aleksander Grygier <redacted>
Co-authored-by: Xuan Son Nguyen <redacted>
14 files changed:
common/arg.cpp
common/chat-parser.cpp
common/chat.cpp
common/common.h
tests/test-chat-parser.cpp
tools/server/README.md
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
tools/server/webui/src/lib/constants/settings-config.ts
tools/server/webui/src/lib/services/chat.ts
tools/server/webui/src/lib/stores/chat.svelte.ts
tools/server/webui/src/lib/utils/thinking.ts [deleted file]
tools/server/webui/src/stories/ChatMessage.stories.svelte

diff --git a/common/arg.cpp b/common/arg.cpp
index ecc296485cb4748cd71d0903f2be2cbbd4ccf0c0..4204f6c6908fbd57c2007e77d8d88aea2e1c6263 100644
@@ -3432,7 +3432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
-        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index b3362519a68f3a243c51562d4ab7652ad35eddcf..7365782e7d6d8670c31ec3417562a986d559b6c9 100644
@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
diff --git a/common/chat.cpp b/common/chat.cpp
index afbb2a2bdd3c41233da3112127da473980d897d3..8587140e1ff0ae78a4bed7238a75a94fba3ba8ab 100644
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
     return data;
 }
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
 
diff --git a/common/common.h b/common/common.h
index 8a8ecd667f2cc7375e9febee5c761cf19cd8a1c0..0d3638c9c6228c3fce31aba1c8e22a8ea7645d86 100644
@@ -433,7 +433,7 @@ struct common_params {
     std::string chat_template = "";                                                                         // NOLINT
     bool use_jinja = false;                                                                                 // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response
 
diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp
index 547ebb4871cd43ca5736bea446ee9111a9bf2c6a..0b275befb8bf4c6837ad2dc644f111d7b45ae006 100644
@@ -106,6 +106,34 @@ static void test_reasoning() {
     assert_equals("<think>Cogito</think>", builder.result().content);
     assert_equals("Ergo sum", builder.consume_rest());
   }
+  {
+    const std::string variant("content_only_inline_think");
+    common_chat_syntax syntax = {
+        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
+        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+        /* .reasoning_in_content = */ false,
+        /* .thinking_forced_open = */ false,
+        /* .parse_tool_calls = */ false,
+    };
+    const std::string input = "<think>Pense</think>Bonjour";
+    auto msg = common_chat_parse(input, false, syntax);
+    assert_equals(variant, std::string("Pense"), msg.reasoning_content);
+    assert_equals(variant, std::string("Bonjour"), msg.content);
+  }
+  {
+    const std::string variant("llama_3_inline_think");
+    common_chat_syntax syntax = {
+        /* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
+        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+        /* .reasoning_in_content = */ false,
+        /* .thinking_forced_open = */ false,
+        /* .parse_tool_calls = */ false,
+    };
+    const std::string input = "<think>Plan</think>RĂ©ponse";
+    auto msg = common_chat_parse(input, false, syntax);
+    assert_equals(variant, std::string("Plan"), msg.reasoning_content);
+    assert_equals(variant, std::string("RĂ©ponse"), msg.content);
+  }
   // Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
   {
     common_chat_syntax syntax = {
diff --git a/tools/server/README.md b/tools/server/README.md
index e23b122ab148ccec3905e1547b25a71d05199010..f5ab9236d52167e34cebf869f78b1ba67eb17ec8 100644
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
index c923bf9e040deb3f133189bb07813c2ce9f90dd0..fed0cf712695ffeb336965753b2747a6e4fa1cee 100644
@@ -1,7 +1,6 @@
 <script lang="ts">
        import { getDeletionInfo } from '$lib/stores/chat.svelte';
        import { copyToClipboard } from '$lib/utils/copy';
-       import { parseThinkingContent } from '$lib/utils/thinking';
        import ChatMessageAssistant from './ChatMessageAssistant.svelte';
        import ChatMessageUser from './ChatMessageUser.svelte';
 
 
        let thinkingContent = $derived.by(() => {
                if (message.role === 'assistant') {
-                       if (message.thinking) {
-                               return message.thinking;
-                       }
-
-                       const parsed = parseThinkingContent(message.content);
+                       const trimmedThinking = message.thinking?.trim();
 
-                       return parsed.thinking;
+                       return trimmedThinking ? trimmedThinking : null;
                }
                return null;
        });
 
-       let messageContent = $derived.by(() => {
-               if (message.role === 'assistant') {
-                       const parsed = parseThinkingContent(message.content);
-                       return parsed.cleanContent?.replace('<|channel|>analysis', '');
-               }
-
-               return message.content?.replace('<|channel|>analysis', '');
-       });
-
        function handleCancelEdit() {
                isEditing = false;
                editedContent = message.content;
                {editedContent}
                {isEditing}
                {message}
-               {messageContent}
+               messageContent={message.content}
                onCancelEdit={handleCancelEdit}
                onConfirmDelete={handleConfirmDelete}
                onCopy={handleCopy}
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
index ad3ffa3792d5888959d2c0d0927d7033bc6dbffb..5539ed9e21c349a1fd232c46ceda8cede797f3ce 100644
                        </div>
                </div>
        {:else if message.role === 'assistant'}
-               <MarkdownContent content={messageContent || ''} />
+               {#if config().disableReasoningFormat}
+                       <pre class="raw-output">{messageContent || ''}</pre>
+               {:else}
+                       <MarkdownContent content={messageContent || ''} />
+               {/if}
        {:else}
                <div class="text-sm whitespace-pre-wrap">
                        {messageContent}
                        background-position: -200% 0;
                }
        }
+
+       .raw-output {
+               width: 100%;
+               max-width: 48rem;
+               margin-top: 1.5rem;
+               padding: 1rem 1.25rem;
+               border-radius: 1rem;
+               background: hsl(var(--muted) / 0.3);
+               color: var(--foreground);
+               font-family:
+                       ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
+                       'Liberation Mono', Menlo, monospace;
+               font-size: 0.875rem;
+               line-height: 1.6;
+               white-space: pre-wrap;
+               word-break: break-word;
+       }
 </style>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
index d832abc2e2831c53e760e4fccf65d9d3c9d75e10..7c25e5925882961dfd5bef903004fe355200d941 100644
                                        key: 'showThoughtInProgress',
                                        label: 'Show thought in progress',
                                        type: 'checkbox'
+                               },
+                               {
+                                       key: 'disableReasoningFormat',
+                                       label:
+                                               'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
+                                       type: 'checkbox'
                                }
                        ]
                },
diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts
index 5cb22f4c34082f6f8f5ef97f33d52b908ee34918..63e4364ae5656a4809d4d5a893f9e2edabb2474d 100644
@@ -6,6 +6,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
        theme: 'system',
        showTokensPerSecond: false,
        showThoughtInProgress: false,
+       disableReasoningFormat: false,
        keepStatsVisible: false,
        askForTitleConfirmation: false,
        pasteLongTextToFileLen: 2500,
@@ -76,6 +77,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
        custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
        showTokensPerSecond: 'Display generation speed in tokens per second during streaming.',
        showThoughtInProgress: 'Expand thought process by default when generating messages.',
+       disableReasoningFormat:
+               'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
        keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
        askForTitleConfirmation:
                'Ask for confirmation before automatically changing conversation title when editing the first message.',
diff --git a/tools/server/webui/src/lib/services/chat.ts b/tools/server/webui/src/lib/services/chat.ts
index 369cdf4e8b9359e09c6aea7cac6aa338ecec67d9..e1dad24a8f2460e52728970188a7da9581f3bc16 100644
@@ -78,6 +78,8 @@ export class ChatService {
                        timings_per_token
                } = options;
 
+               const currentConfig = config();
+
                // Cancel any ongoing request and create a new abort controller
                this.abort();
                this.abortController = new AbortController();
@@ -117,7 +119,7 @@ export class ChatService {
                        stream
                };
 
-               requestBody.reasoning_format = 'auto';
+               requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';
 
                if (temperature !== undefined) requestBody.temperature = temperature;
                // Set max_tokens to -1 (infinite) if not provided or empty
@@ -161,7 +163,6 @@ export class ChatService {
                }
 
                try {
-                       const currentConfig = config();
                        const apiKey = currentConfig.apiKey?.toString().trim();
 
                        const response = await fetch(`./v1/chat/completions`, {
@@ -256,10 +257,8 @@ export class ChatService {
                }
 
                const decoder = new TextDecoder();
-               let fullResponse = '';
+               let aggregatedContent = '';
                let fullReasoningContent = '';
-               let regularContent = '';
-               let insideThinkTag = false;
                let hasReceivedData = false;
                let lastTimings: ChatMessageTimings | undefined;
 
@@ -277,7 +276,7 @@ export class ChatService {
                                        if (line.startsWith('data: ')) {
                                                const data = line.slice(6);
                                                if (data === '[DONE]') {
-                                                       if (!hasReceivedData && fullResponse.length === 0) {
+                                                       if (!hasReceivedData && aggregatedContent.length === 0) {
                                                                const contextError = new Error(
                                                                        'The request exceeds the available context size. Try increasing the context size or enable context shift.'
                                                                );
@@ -286,7 +285,7 @@ export class ChatService {
                                                                return;
                                                        }
 
-                                                       onComplete?.(regularContent, fullReasoningContent || undefined, lastTimings);
+                                                       onComplete?.(aggregatedContent, fullReasoningContent || undefined, lastTimings);
 
                                                        return;
                                                }
@@ -310,27 +309,8 @@ export class ChatService {
 
                                                        if (content) {
                                                                hasReceivedData = true;
-                                                               fullResponse += content;
-
-                                                               // Track the regular content before processing this chunk
-                                                               const regularContentBefore = regularContent;
-
-                                                               // Process content character by character to handle think tags
-                                                               insideThinkTag = this.processContentForThinkTags(
-                                                                       content,
-                                                                       insideThinkTag,
-                                                                       () => {
-                                                                               // Think content is ignored - we don't include it in API requests
-                                                                       },
-                                                                       (regularChunk) => {
-                                                                               regularContent += regularChunk;
-                                                                       }
-                                                               );
-
-                                                               const newRegularContent = regularContent.slice(regularContentBefore.length);
-                                                               if (newRegularContent) {
-                                                                       onChunk?.(newRegularContent);
-                                                               }
+                                                               aggregatedContent += content;
+                                                               onChunk?.(content);
                                                        }
 
                                                        if (reasoningContent) {
@@ -345,7 +325,7 @@ export class ChatService {
                                }
                        }
 
-                       if (!hasReceivedData && fullResponse.length === 0) {
+                       if (!hasReceivedData && aggregatedContent.length === 0) {
                                const contextError = new Error(
                                        'The request exceeds the available context size. Try increasing the context size or enable context shift.'
                                );
@@ -552,51 +532,6 @@ export class ChatService {
                }
        }
 
-       /**
-        * Processes content to separate thinking tags from regular content.
-        * Parses <think> and </think> tags to route content to appropriate handlers.
-        *
-        * @param content - The content string to process
-        * @param currentInsideThinkTag - Current state of whether we're inside a think tag
-        * @param addThinkContent - Callback to handle content inside think tags
-        * @param addRegularContent - Callback to handle regular content outside think tags
-        * @returns Boolean indicating if we're still inside a think tag after processing
-        * @private
-        */
-       private processContentForThinkTags(
-               content: string,
-               currentInsideThinkTag: boolean,
-               addThinkContent: (chunk: string) => void,
-               addRegularContent: (chunk: string) => void
-       ): boolean {
-               let i = 0;
-               let insideThinkTag = currentInsideThinkTag;
-
-               while (i < content.length) {
-                       if (!insideThinkTag && content.substring(i, i + 7) === '<think>') {
-                               insideThinkTag = true;
-                               i += 7; // Skip the <think> tag
-                               continue;
-                       }
-
-                       if (insideThinkTag && content.substring(i, i + 8) === '</think>') {
-                               insideThinkTag = false;
-                               i += 8; // Skip the </think> tag
-                               continue;
-                       }
-
-                       if (insideThinkTag) {
-                               addThinkContent(content[i]);
-                       } else {
-                               addRegularContent(content[i]);
-                       }
-
-                       i++;
-               }
-
-               return insideThinkTag;
-       }
-
        /**
         * Aborts any ongoing chat completion request.
         * Cancels the current request and cleans up the abort controller.
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
index 5bf964f9adfd465d0989c35b678ba93a4da39e11..9bac1b9d36f9bfa46046fbe320dd6b8b777cd139 100644
@@ -5,7 +5,6 @@ import { config } from '$lib/stores/settings.svelte';
 import { filterByLeafNodeId, findLeafNode, findDescendantMessages } from '$lib/utils/branching';
 import { browser } from '$app/environment';
 import { goto } from '$app/navigation';
-import { extractPartialThinking } from '$lib/utils/thinking';
 import { toast } from 'svelte-sonner';
 import type { ExportedConversations } from '$lib/types/database';
 
@@ -344,11 +343,9 @@ class ChatStore {
                                this.currentResponse = streamedContent;
 
                                captureModelIfNeeded();
-
-                               const partialThinking = extractPartialThinking(streamedContent);
                                const messageIndex = this.findMessageIndex(assistantMessage.id);
                                this.updateMessageAtIndex(messageIndex, {
-                                       content: partialThinking.remainingContent || streamedContent
+                                       content: streamedContent
                                });
                        },
 
@@ -696,18 +693,16 @@ class ChatStore {
 
                if (lastMessage && lastMessage.role === 'assistant') {
                        try {
-                               const partialThinking = extractPartialThinking(this.currentResponse);
-
                                const updateData: {
                                        content: string;
                                        thinking?: string;
                                        timings?: ChatMessageTimings;
                                } = {
-                                       content: partialThinking.remainingContent || this.currentResponse
+                                       content: this.currentResponse
                                };
 
-                               if (partialThinking.thinking) {
-                                       updateData.thinking = partialThinking.thinking;
+                               if (lastMessage.thinking?.trim()) {
+                                       updateData.thinking = lastMessage.thinking;
                                }
 
                                const lastKnownState = await slotsService.getCurrentState();
@@ -727,7 +722,10 @@ class ChatStore {
 
                                await DatabaseStore.updateMessage(lastMessage.id, updateData);
 
-                               lastMessage.content = partialThinking.remainingContent || this.currentResponse;
+                               lastMessage.content = this.currentResponse;
+                               if (updateData.thinking !== undefined) {
+                                       lastMessage.thinking = updateData.thinking;
+                               }
                                if (updateData.timings) {
                                        lastMessage.timings = updateData.timings;
                                }
diff --git a/tools/server/webui/src/lib/utils/thinking.ts b/tools/server/webui/src/lib/utils/thinking.ts
deleted file mode 100644
index bed13fc..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Parses thinking content from a message that may contain <think> tags or [THINK] tags
- * Returns an object with thinking content and cleaned message content
- * Handles both complete blocks and incomplete blocks (streaming)
- * Supports formats: <think>...</think> and [THINK]...[/THINK]
- * @param content - The message content to parse
- * @returns An object containing the extracted thinking content and the cleaned message content
- */
-export function parseThinkingContent(content: string): {
-       thinking: string | null;
-       cleanContent: string;
-} {
-       const incompleteThinkMatch = content.includes('<think>') && !content.includes('</think>');
-       const incompleteThinkBracketMatch = content.includes('[THINK]') && !content.includes('[/THINK]');
-
-       if (incompleteThinkMatch) {
-               const cleanContent = content.split('</think>')?.[1]?.trim();
-               const thinkingContent = content.split('<think>')?.[1]?.trim();
-
-               return {
-                       cleanContent,
-                       thinking: thinkingContent
-               };
-       }
-
-       if (incompleteThinkBracketMatch) {
-               const cleanContent = content.split('[/THINK]')?.[1]?.trim();
-               const thinkingContent = content.split('[THINK]')?.[1]?.trim();
-
-               return {
-                       cleanContent,
-                       thinking: thinkingContent
-               };
-       }
-
-       const completeThinkMatch = content.match(/<think>([\s\S]*?)<\/think>/);
-       const completeThinkBracketMatch = content.match(/\[THINK\]([\s\S]*?)\[\/THINK\]/);
-
-       if (completeThinkMatch) {
-               const thinkingContent = completeThinkMatch[1]?.trim() ?? '';
-               const cleanContent = `${content.slice(0, completeThinkMatch.index ?? 0)}${content.slice(
-                       (completeThinkMatch.index ?? 0) + completeThinkMatch[0].length
-               )}`.trim();
-
-               return {
-                       thinking: thinkingContent,
-                       cleanContent
-               };
-       }
-
-       if (completeThinkBracketMatch) {
-               const thinkingContent = completeThinkBracketMatch[1]?.trim() ?? '';
-               const cleanContent = `${content.slice(0, completeThinkBracketMatch.index ?? 0)}${content.slice(
-                       (completeThinkBracketMatch.index ?? 0) + completeThinkBracketMatch[0].length
-               )}`.trim();
-
-               return {
-                       thinking: thinkingContent,
-                       cleanContent
-               };
-       }
-
-       return {
-               thinking: null,
-               cleanContent: content
-       };
-}
-
-/**
- * Checks if content contains an opening thinking tag (for streaming)
- * Supports both <think> and [THINK] formats
- * @param content - The message content to check
- * @returns True if the content contains an opening thinking tag
- */
-export function hasThinkingStart(content: string): boolean {
-       return (
-               content.includes('<think>') ||
-               content.includes('[THINK]') ||
-               content.includes('<|channel|>analysis')
-       );
-}
-
-/**
- * Checks if content contains a closing thinking tag (for streaming)
- * Supports both </think> and [/THINK] formats
- * @param content - The message content to check
- * @returns True if the content contains a closing thinking tag
- */
-export function hasThinkingEnd(content: string): boolean {
-       return content.includes('</think>') || content.includes('[/THINK]');
-}
-
-/**
- * Extracts partial thinking content during streaming
- * Supports both <think> and [THINK] formats
- * Used when we have opening tag but not yet closing tag
- * @param content - The message content to extract partial thinking from
- * @returns An object containing the extracted partial thinking content and the remaining content
- */
-export function extractPartialThinking(content: string): {
-       thinking: string | null;
-       remainingContent: string;
-} {
-       const thinkStartIndex = content.indexOf('<think>');
-       const thinkEndIndex = content.indexOf('</think>');
-
-       const bracketStartIndex = content.indexOf('[THINK]');
-       const bracketEndIndex = content.indexOf('[/THINK]');
-
-       const useThinkFormat =
-               thinkStartIndex !== -1 && (bracketStartIndex === -1 || thinkStartIndex < bracketStartIndex);
-       const useBracketFormat =
-               bracketStartIndex !== -1 && (thinkStartIndex === -1 || bracketStartIndex < thinkStartIndex);
-
-       if (useThinkFormat) {
-               if (thinkEndIndex === -1) {
-                       const thinkingStart = thinkStartIndex + '<think>'.length;
-
-                       return {
-                               thinking: content.substring(thinkingStart),
-                               remainingContent: content.substring(0, thinkStartIndex)
-                       };
-               }
-       } else if (useBracketFormat) {
-               if (bracketEndIndex === -1) {
-                       const thinkingStart = bracketStartIndex + '[THINK]'.length;
-
-                       return {
-                               thinking: content.substring(thinkingStart),
-                               remainingContent: content.substring(0, bracketStartIndex)
-                       };
-               }
-       } else {
-               return { thinking: null, remainingContent: content };
-       }
-
-       const parsed = parseThinkingContent(content);
-
-       return {
-               thinking: parsed.thinking,
-               remainingContent: parsed.cleanContent
-       };
-}
diff --git a/tools/server/webui/src/stories/ChatMessage.stories.svelte b/tools/server/webui/src/stories/ChatMessage.stories.svelte
index c6377e23cb6fd314d32e3a2fad29d50a890db0e1..6529b75a307cd266691753337c67f72ff37d3879 100644
                children: []
        };
 
-       let processingMessage = $state({
-               id: '4',
-               convId: 'conv-1',
-               type: 'message',
-               timestamp: 0, // No timestamp = processing
-               role: 'assistant',
-               content: '',
-               parent: '1',
-               thinking: '',
-               children: []
-       });
-
-       let streamingMessage = $state({
-               id: '5',
-               convId: 'conv-1',
-               type: 'message',
-               timestamp: 0, // No timestamp = streaming
-               role: 'assistant',
-               content: '',
-               parent: '1',
-               thinking: '',
-               children: []
-       });
-
-       // Message with <think> format thinking content
-       const thinkTagMessage: DatabaseMessage = {
-               id: '6',
+       const assistantWithReasoning: DatabaseMessage = {
+               id: '3',
                convId: 'conv-1',
                type: 'message',
                timestamp: Date.now() - 1000 * 60 * 2,
                role: 'assistant',
-               content:
-                       "<think>\nLet me analyze this step by step:\n\n1. The user is asking about thinking formats\n2. I need to demonstrate the &lt;think&gt; tag format\n3. This content should be displayed in the thinking section\n4. The main response should be separate\n\nThis is a good example of reasoning content.\n</think>\n\nHere's my response after thinking through the problem. The thinking content above should be displayed separately from this main response content.",
+               content: "Here's the concise answer, now that I've thought it through carefully for you.",
                parent: '1',
-               thinking: '',
+               thinking:
+                       "Let's consider the user's question step by step:\\n\\n1. Identify the core problem\\n2. Evaluate relevant information\\n3. Formulate a clear answer\\n\\nFollowing this process ensures the final response stays focused and accurate.",
                children: []
        };
-
-       // Message with [THINK] format thinking content
-       const thinkBracketMessage: DatabaseMessage = {
-               id: '7',
+       const rawOutputMessage: DatabaseMessage = {
+               id: '6',
                convId: 'conv-1',
                type: 'message',
-               timestamp: Date.now() - 1000 * 60 * 1,
+               timestamp: Date.now() - 1000 * 60,
                role: 'assistant',
                content:
-                       '[THINK]\nThis is the DeepSeek-style thinking format:\n\n- Using square brackets instead of angle brackets\n- Should work identically to the &lt;think&gt; format\n- Content parsing should extract this reasoning\n- Display should be the same as &lt;think&gt; format\n\nBoth formats should be supported seamlessly.\n[/THINK]\n\nThis is the main response content that comes after the [THINK] block. The reasoning above should be parsed and displayed in the thinking section.',
+                       '<|channel|>analysis<|message|>User greeted me. Initiating overcomplicated analysis: Is this a trap? No, just a normal hello. Respond calmly, act like a helpful assistant, and do not start explaining quantum physics again. Confidence 0.73. Engaging socially acceptable greeting protocol...<|end|>Hello there! How can I help you today?',
                parent: '1',
                thinking: '',
                children: []
        };
 
-       // Streaming message for <think> format
-       let streamingThinkMessage = $state({
-               id: '8',
+       let processingMessage = $state({
+               id: '4',
                convId: 'conv-1',
                type: 'message',
-               timestamp: 0, // No timestamp = streaming
+               timestamp: 0, // No timestamp = processing
                role: 'assistant',
                content: '',
                parent: '1',
                children: []
        });
 
-       // Streaming message for [THINK] format
-       let streamingBracketMessage = $state({
-               id: '9',
+       let streamingMessage = $state({
+               id: '5',
                convId: 'conv-1',
                type: 'message',
                timestamp: 0, // No timestamp = streaming
        args={{
                message: userMessage
        }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
+       }}
 />
 
 <Story
                class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
                message: assistantMessage
        }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
+       }}
+/>
+
+<Story
+       name="AssistantWithReasoning"
+       args={{
+               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
+               message: assistantWithReasoning
+       }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
+       }}
+/>
+
+<Story
+       name="RawLlmOutput"
+       args={{
+               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
+               message: rawOutputMessage
+       }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', true);
+       }}
 />
 
 <Story
-       name="WithThinkingBlock"
+       name="WithReasoningContent"
        args={{
                message: streamingMessage
        }}
        asChild
        play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
                // Phase 1: Stream reasoning content in chunks
                let reasoningText =
                        'I need to think about this carefully. Let me break down the problem:\n\n1. The user is asking for help with something complex\n2. I should provide a thorough and helpful response\n3. I need to consider multiple approaches\n4. The best solution would be to explain step by step\n\nThis approach will ensure clarity and understanding.';
                message: processingMessage
        }}
        play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
                // Import the chat store to simulate loading state
                const { chatStore } = await import('$lib/stores/chat.svelte');
-               
+
                // Set loading state to true to trigger the processing UI
                chatStore.isLoading = true;
-               
+
                // Simulate the processing state hook behavior
                // This will show the "Generating..." text and parameter details
-               await new Promise(resolve => setTimeout(resolve, 100));
-       }}
-/>
-
-<Story
-       name="ThinkTagFormat"
-       args={{
-               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
-               message: thinkTagMessage
+               await new Promise((resolve) => setTimeout(resolve, 100));
        }}
 />
-
-<Story
-       name="ThinkBracketFormat"
-       args={{
-               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
-               message: thinkBracketMessage
-       }}
-/>
-
-<Story
-       name="StreamingThinkTag"
-       args={{
-               message: streamingThinkMessage
-       }}
-       parameters={{
-               test: {
-                       timeout: 30000
-               }
-       }}
-       asChild
-       play={async () => {
-               // Phase 1: Stream <think> reasoning content
-               const thinkingContent =
-                       'Let me work through this problem systematically:\n\n1. First, I need to understand what the user is asking\n2. Then I should consider different approaches\n3. I need to evaluate the pros and cons\n4. Finally, I should provide a clear recommendation\n\nThis step-by-step approach will ensure accuracy.';
-
-               let currentContent = '<think>\n';
-               streamingThinkMessage.content = currentContent;
-
-               for (let i = 0; i < thinkingContent.length; i++) {
-                       currentContent += thinkingContent[i];
-                       streamingThinkMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 5));
-               }
-
-               // Close the thinking block
-               currentContent += '\n</think>\n\n';
-               streamingThinkMessage.content = currentContent;
-               await new Promise((resolve) => setTimeout(resolve, 200));
-
-               // Phase 2: Stream main response content
-               const responseContent =
-                       "Based on my analysis above, here's the solution:\n\n**Key Points:**\n- The approach should be systematic\n- We need to consider all factors\n- Implementation should be step-by-step\n\nThis ensures the best possible outcome.";
-
-               for (let i = 0; i < responseContent.length; i++) {
-                       currentContent += responseContent[i];
-                       streamingThinkMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 10));
-               }
-
-               streamingThinkMessage.timestamp = Date.now();
-       }}
->
-       <div class="w-[56rem]">
-               <ChatMessage message={streamingThinkMessage} />
-       </div>
-</Story>
-
-<Story
-       name="StreamingThinkBracket"
-       args={{
-               message: streamingBracketMessage
-       }}
-       parameters={{
-               test: {
-                       timeout: 30000
-               }
-       }}
-       asChild
-       play={async () => {
-               // Phase 1: Stream [THINK] reasoning content
-               const thinkingContent =
-                       'Using the DeepSeek format now:\n\n- This demonstrates the &#91;THINK&#93; bracket format\n- Should parse identically to &lt;think&gt; tags\n- The UI should display this in the thinking section\n- Main content should be separate\n\nBoth formats provide the same functionality.';
-
-               let currentContent = '[THINK]\n';
-               streamingBracketMessage.content = currentContent;
-
-               for (let i = 0; i < thinkingContent.length; i++) {
-                       currentContent += thinkingContent[i];
-                       streamingBracketMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 5));
-               }
-
-               // Close the thinking block
-               currentContent += '\n[/THINK]\n\n';
-               streamingBracketMessage.content = currentContent;
-               await new Promise((resolve) => setTimeout(resolve, 200));
-
-               // Phase 2: Stream main response content
-               const responseContent =
-                       "Here's my response after using the &#91;THINK&#93; format:\n\n**Observations:**\n- Both &lt;think&gt; and &#91;THINK&#93; formats work seamlessly\n- The parsing logic handles both cases\n- UI display is consistent across formats\n\nThis demonstrates the enhanced thinking content support.";
-
-               for (let i = 0; i < responseContent.length; i++) {
-                       currentContent += responseContent[i];
-                       streamingBracketMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 10));
-               }
-
-               streamingBracketMessage.timestamp = Date.now();
-       }}
->
-       <div class="w-[56rem]">
-               <ChatMessage message={streamingBracketMessage} />
-       </div>
-</Story>