git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
refactor: centralize CoT parsing in backend for streaming mode (#16394)
author Pascal <redacted>
Wed, 8 Oct 2025 20:18:41 +0000 (22:18 +0200)
committer GitHub <redacted>
Wed, 8 Oct 2025 20:18:41 +0000 (23:18 +0300)
* refactor: unify reasoning handling via backend reasoning_content, drop frontend tag parsing

- Updated the chat message component to surface backend-supplied reasoning via message.thinking while showing the raw assistant content without inline tag scrubbing
- Simplified chat streaming to append content chunks directly, stream reasoning into the message model, and persist any partial reasoning when generation stops
- Refactored the chat service SSE handler to rely on server-provided reasoning_content, removing legacy <think> parsing logic
- Refreshed Storybook data and streaming flows to populate the thinking field explicitly for static and streaming assistant messages

* refactor: implement streaming-aware universal reasoning parser

Remove the streaming mode limitation from --reasoning-format by refactoring
try_parse_reasoning() to handle incremental parsing of <think> tags across
all formats.

- Rework try_parse_reasoning() to track whitespace, partial tags, and
  multiple reasoning segments, allowing proper separation of reasoning_content
  and content in streaming mode
- Parse reasoning tags before tool call handling in content-only and Llama 3.x
  formats to ensure inline <think> blocks are captured correctly
- Change default reasoning_format from 'auto' to 'deepseek' for consistent
  behavior
- Add 'deepseek-legacy' option to preserve old inline behavior when needed
- Update CLI help and documentation to reflect streaming support
- Add parser tests for inline <think>...</think> segments

The parser now continues processing content after </think> closes instead of
stopping, enabling proper message.reasoning_content and message.content
separation in both streaming and non-streaming modes.

Fixes the issue where streaming responses would dump everything (including
post-thinking content) into reasoning_content while leaving content empty.
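
A minimal usage sketch of the resulting behavior (mirroring the new test-chat-parser cases and assuming the declarations from common/chat.h):

    common_chat_syntax syntax = {
        /* .format               = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
        /* .reasoning_format     = */ COMMON_REASONING_FORMAT_DEEPSEEK,
        /* .reasoning_in_content = */ false,
        /* .thinking_forced_open = */ false,
        /* .parse_tool_calls     = */ false,
    };

    // Non-streaming call (is_partial = false); the streaming path feeds growing
    // prefixes of the same text with is_partial = true and converges on the same split.
    auto msg = common_chat_parse("<think>Pense</think>Bonjour", /* is_partial */ false, syntax);
    // msg.reasoning_content == "Pense"
    // msg.content           == "Bonjour"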

* refactor: address review feedback from allozaur

- Passed the assistant message content directly to ChatMessageAssistant to drop the redundant derived state in the chat message component
- Simplified chat streaming updates by removing unused partial-thinking handling and persisting partial responses straight from currentResponse
- Refreshed the ChatMessage stories to cover standard and reasoning scenarios without the old THINK-tag parsing examples

Co-authored-by: Aleksander Grygier <redacted>

* refactor: restore forced reasoning prefix to pass test-chat ([chat] All tests passed)

- Store the exact sequence seen on input when 'thinking_forced_open' enforces a reasoning block
- Inject this prefix before the first accumulated segment in 'reasoning_content', then clear it to avoid duplication
- Repeat the capture on every new 'start_think' detection to properly handle partial/streaming flows
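
A rough, self-contained illustration of that prefix handling (hypothetical stand-in code, not the actual common_chat_msg_parser implementation): the captured opening tag is emitted exactly once, in front of the first accumulated reasoning segment, and then cleared.

    #include <iostream>
    #include <string>

    int main() {
        std::string pending_reasoning_prefix = "<think>"; // captured when the opening tag is seen on input
        std::string reasoning_content;

        // hypothetical stand-in for add_reasoning_content()
        auto add_reasoning = [&](const std::string & segment) {
            if (!pending_reasoning_prefix.empty()) {
                reasoning_content += pending_reasoning_prefix; // injected once
                pending_reasoning_prefix.clear();              // never duplicated
            }
            reasoning_content += segment;
        };

        add_reasoning("first streamed segment ");
        add_reasoning("second streamed segment");

        std::cout << reasoning_content << "\n";
        // prints: <think>first streamed segment second streamed segment
    }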

* refactor: address review feedback from ngxson

* debug: say goodbye to curl -N, hello one-click raw stream

- Adds a new checkbox in the WebUI to display raw LLM output without backend parsing or frontend Markdown rendering

* Update tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte

Co-authored-by: Aleksander Grygier <redacted>

* webui: add Storybook example for raw LLM output and scope reasoning format toggle per story

- Added a Storybook example that showcases the chat message component in raw LLM output mode with the provided trace sample
- Updated every ChatMessage story to toggle the disableReasoningFormat setting so the raw-output rendering remains scoped to its own example

* npm run format

* chat-parser: address review feedback from ngxson

Co-authored-by: Xuan Son Nguyen <redacted>

---------

Co-authored-by: Aleksander Grygier <redacted>
Co-authored-by: Xuan Son Nguyen <redacted>
14 files changed:
common/arg.cpp
common/chat-parser.cpp
common/chat.cpp
common/common.h
tests/test-chat-parser.cpp
tools/server/README.md
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
tools/server/webui/src/lib/constants/settings-config.ts
tools/server/webui/src/lib/services/chat.ts
tools/server/webui/src/lib/stores/chat.svelte.ts
tools/server/webui/src/lib/utils/thinking.ts [deleted file]
tools/server/webui/src/stories/ChatMessage.stories.svelte

diff --git a/common/arg.cpp b/common/arg.cpp
index ecc296485cb4748cd71d0903f2be2cbbd4ccf0c0..4204f6c6908fbd57c2007e77d8d88aea2e1c6263 100644
@@ -3432,7 +3432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
-        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index b3362519a68f3a243c51562d4ab7652ad35eddcf..7365782e7d6d8670c31ec3417562a986d559b6c9 100644
@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
diff --git a/common/chat.cpp b/common/chat.cpp
index afbb2a2bdd3c41233da3112127da473980d897d3..8587140e1ff0ae78a4bed7238a75a94fba3ba8ab 100644
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
     return data;
 }
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
 
diff --git a/common/common.h b/common/common.h
index 8a8ecd667f2cc7375e9febee5c761cf19cd8a1c0..0d3638c9c6228c3fce31aba1c8e22a8ea7645d86 100644
@@ -433,7 +433,7 @@ struct common_params {
     std::string chat_template = "";                                                                         // NOLINT
     bool use_jinja = false;                                                                                 // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response
 
diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp
index 547ebb4871cd43ca5736bea446ee9111a9bf2c6a..0b275befb8bf4c6837ad2dc644f111d7b45ae006 100644
@@ -106,6 +106,34 @@ static void test_reasoning() {
     assert_equals("<think>Cogito</think>", builder.result().content);
     assert_equals("Ergo sum", builder.consume_rest());
   }
+  {
+    const std::string variant("content_only_inline_think");
+    common_chat_syntax syntax = {
+        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
+        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+        /* .reasoning_in_content = */ false,
+        /* .thinking_forced_open = */ false,
+        /* .parse_tool_calls = */ false,
+    };
+    const std::string input = "<think>Pense</think>Bonjour";
+    auto msg = common_chat_parse(input, false, syntax);
+    assert_equals(variant, std::string("Pense"), msg.reasoning_content);
+    assert_equals(variant, std::string("Bonjour"), msg.content);
+  }
+  {
+    const std::string variant("llama_3_inline_think");
+    common_chat_syntax syntax = {
+        /* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
+        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+        /* .reasoning_in_content = */ false,
+        /* .thinking_forced_open = */ false,
+        /* .parse_tool_calls = */ false,
+    };
+    const std::string input = "<think>Plan</think>RĂ©ponse";
+    auto msg = common_chat_parse(input, false, syntax);
+    assert_equals(variant, std::string("Plan"), msg.reasoning_content);
+    assert_equals(variant, std::string("RĂ©ponse"), msg.content);
+  }
   // Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
   {
     common_chat_syntax syntax = {
diff --git a/tools/server/README.md b/tools/server/README.md
index e23b122ab148ccec3905e1547b25a71d05199010..f5ab9236d52167e34cebf869f78b1ba67eb17ec8 100644
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
index c923bf9e040deb3f133189bb07813c2ce9f90dd0..fed0cf712695ffeb336965753b2747a6e4fa1cee 100644
@@ -1,7 +1,6 @@
 <script lang="ts">
        import { getDeletionInfo } from '$lib/stores/chat.svelte';
        import { copyToClipboard } from '$lib/utils/copy';
-       import { parseThinkingContent } from '$lib/utils/thinking';
        import ChatMessageAssistant from './ChatMessageAssistant.svelte';
        import ChatMessageUser from './ChatMessageUser.svelte';
 
 
        let thinkingContent = $derived.by(() => {
                if (message.role === 'assistant') {
-                       if (message.thinking) {
-                               return message.thinking;
-                       }
-
-                       const parsed = parseThinkingContent(message.content);
+                       const trimmedThinking = message.thinking?.trim();
 
-                       return parsed.thinking;
+                       return trimmedThinking ? trimmedThinking : null;
                }
                return null;
        });
 
-       let messageContent = $derived.by(() => {
-               if (message.role === 'assistant') {
-                       const parsed = parseThinkingContent(message.content);
-                       return parsed.cleanContent?.replace('<|channel|>analysis', '');
-               }
-
-               return message.content?.replace('<|channel|>analysis', '');
-       });
-
        function handleCancelEdit() {
                isEditing = false;
                editedContent = message.content;
                {editedContent}
                {isEditing}
                {message}
-               {messageContent}
+               messageContent={message.content}
                onCancelEdit={handleCancelEdit}
                onConfirmDelete={handleConfirmDelete}
                onCopy={handleCopy}
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
index ad3ffa3792d5888959d2c0d0927d7033bc6dbffb..5539ed9e21c349a1fd232c46ceda8cede797f3ce 100644
                        </div>
                </div>
        {:else if message.role === 'assistant'}
-               <MarkdownContent content={messageContent || ''} />
+               {#if config().disableReasoningFormat}
+                       <pre class="raw-output">{messageContent || ''}</pre>
+               {:else}
+                       <MarkdownContent content={messageContent || ''} />
+               {/if}
        {:else}
                <div class="text-sm whitespace-pre-wrap">
                        {messageContent}
                        background-position: -200% 0;
                }
        }
+
+       .raw-output {
+               width: 100%;
+               max-width: 48rem;
+               margin-top: 1.5rem;
+               padding: 1rem 1.25rem;
+               border-radius: 1rem;
+               background: hsl(var(--muted) / 0.3);
+               color: var(--foreground);
+               font-family:
+                       ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
+                       'Liberation Mono', Menlo, monospace;
+               font-size: 0.875rem;
+               line-height: 1.6;
+               white-space: pre-wrap;
+               word-break: break-word;
+       }
 </style>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
index d832abc2e2831c53e760e4fccf65d9d3c9d75e10..7c25e5925882961dfd5bef903004fe355200d941 100644
                                        key: 'showThoughtInProgress',
                                        label: 'Show thought in progress',
                                        type: 'checkbox'
+                               },
+                               {
+                                       key: 'disableReasoningFormat',
+                                       label:
+                                               'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
+                                       type: 'checkbox'
                                }
                        ]
                },
diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts
index 5cb22f4c34082f6f8f5ef97f33d52b908ee34918..63e4364ae5656a4809d4d5a893f9e2edabb2474d 100644
@@ -6,6 +6,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
        theme: 'system',
        showTokensPerSecond: false,
        showThoughtInProgress: false,
+       disableReasoningFormat: false,
        keepStatsVisible: false,
        askForTitleConfirmation: false,
        pasteLongTextToFileLen: 2500,
@@ -76,6 +77,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
        custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
        showTokensPerSecond: 'Display generation speed in tokens per second during streaming.',
        showThoughtInProgress: 'Expand thought process by default when generating messages.',
+       disableReasoningFormat:
+               'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
        keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
        askForTitleConfirmation:
                'Ask for confirmation before automatically changing conversation title when editing the first message.',
diff --git a/tools/server/webui/src/lib/services/chat.ts b/tools/server/webui/src/lib/services/chat.ts
index 369cdf4e8b9359e09c6aea7cac6aa338ecec67d9..e1dad24a8f2460e52728970188a7da9581f3bc16 100644
@@ -78,6 +78,8 @@ export class ChatService {
                        timings_per_token
                } = options;
 
+               const currentConfig = config();
+
                // Cancel any ongoing request and create a new abort controller
                this.abort();
                this.abortController = new AbortController();
@@ -117,7 +119,7 @@ export class ChatService {
                        stream
                };
 
-               requestBody.reasoning_format = 'auto';
+               requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';
 
                if (temperature !== undefined) requestBody.temperature = temperature;
                // Set max_tokens to -1 (infinite) if not provided or empty
@@ -161,7 +163,6 @@ export class ChatService {
                }
 
                try {
-                       const currentConfig = config();
                        const apiKey = currentConfig.apiKey?.toString().trim();
 
                        const response = await fetch(`./v1/chat/completions`, {
@@ -256,10 +257,8 @@ export class ChatService {
                }
 
                const decoder = new TextDecoder();
-               let fullResponse = '';
+               let aggregatedContent = '';
                let fullReasoningContent = '';
-               let regularContent = '';
-               let insideThinkTag = false;
                let hasReceivedData = false;
                let lastTimings: ChatMessageTimings | undefined;
 
@@ -277,7 +276,7 @@ export class ChatService {
                                        if (line.startsWith('data: ')) {
                                                const data = line.slice(6);
                                                if (data === '[DONE]') {
-                                                       if (!hasReceivedData && fullResponse.length === 0) {
+                                                       if (!hasReceivedData && aggregatedContent.length === 0) {
                                                                const contextError = new Error(
                                                                        'The request exceeds the available context size. Try increasing the context size or enable context shift.'
                                                                );
@@ -286,7 +285,7 @@ export class ChatService {
                                                                return;
                                                        }
 
-                                                       onComplete?.(regularContent, fullReasoningContent || undefined, lastTimings);
+                                                       onComplete?.(aggregatedContent, fullReasoningContent || undefined, lastTimings);
 
                                                        return;
                                                }
@@ -310,27 +309,8 @@ export class ChatService {
 
                                                        if (content) {
                                                                hasReceivedData = true;
-                                                               fullResponse += content;
-
-                                                               // Track the regular content before processing this chunk
-                                                               const regularContentBefore = regularContent;
-
-                                                               // Process content character by character to handle think tags
-                                                               insideThinkTag = this.processContentForThinkTags(
-                                                                       content,
-                                                                       insideThinkTag,
-                                                                       () => {
-                                                                               // Think content is ignored - we don't include it in API requests
-                                                                       },
-                                                                       (regularChunk) => {
-                                                                               regularContent += regularChunk;
-                                                                       }
-                                                               );
-
-                                                               const newRegularContent = regularContent.slice(regularContentBefore.length);
-                                                               if (newRegularContent) {
-                                                                       onChunk?.(newRegularContent);
-                                                               }
+                                                               aggregatedContent += content;
+                                                               onChunk?.(content);
                                                        }
 
                                                        if (reasoningContent) {
@@ -345,7 +325,7 @@ export class ChatService {
                                }
                        }
 
-                       if (!hasReceivedData && fullResponse.length === 0) {
+                       if (!hasReceivedData && aggregatedContent.length === 0) {
                                const contextError = new Error(
                                        'The request exceeds the available context size. Try increasing the context size or enable context shift.'
                                );
@@ -552,51 +532,6 @@ export class ChatService {
                }
        }
 
-       /**
-        * Processes content to separate thinking tags from regular content.
-        * Parses <think> and </think> tags to route content to appropriate handlers.
-        *
-        * @param content - The content string to process
-        * @param currentInsideThinkTag - Current state of whether we're inside a think tag
-        * @param addThinkContent - Callback to handle content inside think tags
-        * @param addRegularContent - Callback to handle regular content outside think tags
-        * @returns Boolean indicating if we're still inside a think tag after processing
-        * @private
-        */
-       private processContentForThinkTags(
-               content: string,
-               currentInsideThinkTag: boolean,
-               addThinkContent: (chunk: string) => void,
-               addRegularContent: (chunk: string) => void
-       ): boolean {
-               let i = 0;
-               let insideThinkTag = currentInsideThinkTag;
-
-               while (i < content.length) {
-                       if (!insideThinkTag && content.substring(i, i + 7) === '<think>') {
-                               insideThinkTag = true;
-                               i += 7; // Skip the <think> tag
-                               continue;
-                       }
-
-                       if (insideThinkTag && content.substring(i, i + 8) === '</think>') {
-                               insideThinkTag = false;
-                               i += 8; // Skip the </think> tag
-                               continue;
-                       }
-
-                       if (insideThinkTag) {
-                               addThinkContent(content[i]);
-                       } else {
-                               addRegularContent(content[i]);
-                       }
-
-                       i++;
-               }
-
-               return insideThinkTag;
-       }
-
        /**
         * Aborts any ongoing chat completion request.
         * Cancels the current request and cleans up the abort controller.
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
index 5bf964f9adfd465d0989c35b678ba93a4da39e11..9bac1b9d36f9bfa46046fbe320dd6b8b777cd139 100644
@@ -5,7 +5,6 @@ import { config } from '$lib/stores/settings.svelte';
 import { filterByLeafNodeId, findLeafNode, findDescendantMessages } from '$lib/utils/branching';
 import { browser } from '$app/environment';
 import { goto } from '$app/navigation';
-import { extractPartialThinking } from '$lib/utils/thinking';
 import { toast } from 'svelte-sonner';
 import type { ExportedConversations } from '$lib/types/database';
 
@@ -344,11 +343,9 @@ class ChatStore {
                                this.currentResponse = streamedContent;
 
                                captureModelIfNeeded();
-
-                               const partialThinking = extractPartialThinking(streamedContent);
                                const messageIndex = this.findMessageIndex(assistantMessage.id);
                                this.updateMessageAtIndex(messageIndex, {
-                                       content: partialThinking.remainingContent || streamedContent
+                                       content: streamedContent
                                });
                        },
 
@@ -696,18 +693,16 @@ class ChatStore {
 
                if (lastMessage && lastMessage.role === 'assistant') {
                        try {
-                               const partialThinking = extractPartialThinking(this.currentResponse);
-
                                const updateData: {
                                        content: string;
                                        thinking?: string;
                                        timings?: ChatMessageTimings;
                                } = {
-                                       content: partialThinking.remainingContent || this.currentResponse
+                                       content: this.currentResponse
                                };
 
-                               if (partialThinking.thinking) {
-                                       updateData.thinking = partialThinking.thinking;
+                               if (lastMessage.thinking?.trim()) {
+                                       updateData.thinking = lastMessage.thinking;
                                }
 
                                const lastKnownState = await slotsService.getCurrentState();
@@ -727,7 +722,10 @@ class ChatStore {
 
                                await DatabaseStore.updateMessage(lastMessage.id, updateData);
 
-                               lastMessage.content = partialThinking.remainingContent || this.currentResponse;
+                               lastMessage.content = this.currentResponse;
+                               if (updateData.thinking !== undefined) {
+                                       lastMessage.thinking = updateData.thinking;
+                               }
                                if (updateData.timings) {
                                        lastMessage.timings = updateData.timings;
                                }
diff --git a/tools/server/webui/src/lib/utils/thinking.ts b/tools/server/webui/src/lib/utils/thinking.ts
deleted file mode 100644
index bed13fc..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Parses thinking content from a message that may contain <think> tags or [THINK] tags
- * Returns an object with thinking content and cleaned message content
- * Handles both complete blocks and incomplete blocks (streaming)
- * Supports formats: <think>...</think> and [THINK]...[/THINK]
- * @param content - The message content to parse
- * @returns An object containing the extracted thinking content and the cleaned message content
- */
-export function parseThinkingContent(content: string): {
-       thinking: string | null;
-       cleanContent: string;
-} {
-       const incompleteThinkMatch = content.includes('<think>') && !content.includes('</think>');
-       const incompleteThinkBracketMatch = content.includes('[THINK]') && !content.includes('[/THINK]');
-
-       if (incompleteThinkMatch) {
-               const cleanContent = content.split('</think>')?.[1]?.trim();
-               const thinkingContent = content.split('<think>')?.[1]?.trim();
-
-               return {
-                       cleanContent,
-                       thinking: thinkingContent
-               };
-       }
-
-       if (incompleteThinkBracketMatch) {
-               const cleanContent = content.split('[/THINK]')?.[1]?.trim();
-               const thinkingContent = content.split('[THINK]')?.[1]?.trim();
-
-               return {
-                       cleanContent,
-                       thinking: thinkingContent
-               };
-       }
-
-       const completeThinkMatch = content.match(/<think>([\s\S]*?)<\/think>/);
-       const completeThinkBracketMatch = content.match(/\[THINK\]([\s\S]*?)\[\/THINK\]/);
-
-       if (completeThinkMatch) {
-               const thinkingContent = completeThinkMatch[1]?.trim() ?? '';
-               const cleanContent = `${content.slice(0, completeThinkMatch.index ?? 0)}${content.slice(
-                       (completeThinkMatch.index ?? 0) + completeThinkMatch[0].length
-               )}`.trim();
-
-               return {
-                       thinking: thinkingContent,
-                       cleanContent
-               };
-       }
-
-       if (completeThinkBracketMatch) {
-               const thinkingContent = completeThinkBracketMatch[1]?.trim() ?? '';
-               const cleanContent = `${content.slice(0, completeThinkBracketMatch.index ?? 0)}${content.slice(
-                       (completeThinkBracketMatch.index ?? 0) + completeThinkBracketMatch[0].length
-               )}`.trim();
-
-               return {
-                       thinking: thinkingContent,
-                       cleanContent
-               };
-       }
-
-       return {
-               thinking: null,
-               cleanContent: content
-       };
-}
-
-/**
- * Checks if content contains an opening thinking tag (for streaming)
- * Supports both <think> and [THINK] formats
- * @param content - The message content to check
- * @returns True if the content contains an opening thinking tag
- */
-export function hasThinkingStart(content: string): boolean {
-       return (
-               content.includes('<think>') ||
-               content.includes('[THINK]') ||
-               content.includes('<|channel|>analysis')
-       );
-}
-
-/**
- * Checks if content contains a closing thinking tag (for streaming)
- * Supports both </think> and [/THINK] formats
- * @param content - The message content to check
- * @returns True if the content contains a closing thinking tag
- */
-export function hasThinkingEnd(content: string): boolean {
-       return content.includes('</think>') || content.includes('[/THINK]');
-}
-
-/**
- * Extracts partial thinking content during streaming
- * Supports both <think> and [THINK] formats
- * Used when we have opening tag but not yet closing tag
- * @param content - The message content to extract partial thinking from
- * @returns An object containing the extracted partial thinking content and the remaining content
- */
-export function extractPartialThinking(content: string): {
-       thinking: string | null;
-       remainingContent: string;
-} {
-       const thinkStartIndex = content.indexOf('<think>');
-       const thinkEndIndex = content.indexOf('</think>');
-
-       const bracketStartIndex = content.indexOf('[THINK]');
-       const bracketEndIndex = content.indexOf('[/THINK]');
-
-       const useThinkFormat =
-               thinkStartIndex !== -1 && (bracketStartIndex === -1 || thinkStartIndex < bracketStartIndex);
-       const useBracketFormat =
-               bracketStartIndex !== -1 && (thinkStartIndex === -1 || bracketStartIndex < thinkStartIndex);
-
-       if (useThinkFormat) {
-               if (thinkEndIndex === -1) {
-                       const thinkingStart = thinkStartIndex + '<think>'.length;
-
-                       return {
-                               thinking: content.substring(thinkingStart),
-                               remainingContent: content.substring(0, thinkStartIndex)
-                       };
-               }
-       } else if (useBracketFormat) {
-               if (bracketEndIndex === -1) {
-                       const thinkingStart = bracketStartIndex + '[THINK]'.length;
-
-                       return {
-                               thinking: content.substring(thinkingStart),
-                               remainingContent: content.substring(0, bracketStartIndex)
-                       };
-               }
-       } else {
-               return { thinking: null, remainingContent: content };
-       }
-
-       const parsed = parseThinkingContent(content);
-
-       return {
-               thinking: parsed.thinking,
-               remainingContent: parsed.cleanContent
-       };
-}
diff --git a/tools/server/webui/src/stories/ChatMessage.stories.svelte b/tools/server/webui/src/stories/ChatMessage.stories.svelte
index c6377e23cb6fd314d32e3a2fad29d50a890db0e1..6529b75a307cd266691753337c67f72ff37d3879 100644
                children: []
        };
 
-       let processingMessage = $state({
-               id: '4',
-               convId: 'conv-1',
-               type: 'message',
-               timestamp: 0, // No timestamp = processing
-               role: 'assistant',
-               content: '',
-               parent: '1',
-               thinking: '',
-               children: []
-       });
-
-       let streamingMessage = $state({
-               id: '5',
-               convId: 'conv-1',
-               type: 'message',
-               timestamp: 0, // No timestamp = streaming
-               role: 'assistant',
-               content: '',
-               parent: '1',
-               thinking: '',
-               children: []
-       });
-
-       // Message with <think> format thinking content
-       const thinkTagMessage: DatabaseMessage = {
-               id: '6',
+       const assistantWithReasoning: DatabaseMessage = {
+               id: '3',
                convId: 'conv-1',
                type: 'message',
                timestamp: Date.now() - 1000 * 60 * 2,
                role: 'assistant',
-               content:
-                       "<think>\nLet me analyze this step by step:\n\n1. The user is asking about thinking formats\n2. I need to demonstrate the &lt;think&gt; tag format\n3. This content should be displayed in the thinking section\n4. The main response should be separate\n\nThis is a good example of reasoning content.\n</think>\n\nHere's my response after thinking through the problem. The thinking content above should be displayed separately from this main response content.",
+               content: "Here's the concise answer, now that I've thought it through carefully for you.",
                parent: '1',
-               thinking: '',
+               thinking:
+                       "Let's consider the user's question step by step:\\n\\n1. Identify the core problem\\n2. Evaluate relevant information\\n3. Formulate a clear answer\\n\\nFollowing this process ensures the final response stays focused and accurate.",
                children: []
        };
-
-       // Message with [THINK] format thinking content
-       const thinkBracketMessage: DatabaseMessage = {
-               id: '7',
+       const rawOutputMessage: DatabaseMessage = {
+               id: '6',
                convId: 'conv-1',
                type: 'message',
-               timestamp: Date.now() - 1000 * 60 * 1,
+               timestamp: Date.now() - 1000 * 60,
                role: 'assistant',
                content:
-                       '[THINK]\nThis is the DeepSeek-style thinking format:\n\n- Using square brackets instead of angle brackets\n- Should work identically to the &lt;think&gt; format\n- Content parsing should extract this reasoning\n- Display should be the same as &lt;think&gt; format\n\nBoth formats should be supported seamlessly.\n[/THINK]\n\nThis is the main response content that comes after the [THINK] block. The reasoning above should be parsed and displayed in the thinking section.',
+                       '<|channel|>analysis<|message|>User greeted me. Initiating overcomplicated analysis: Is this a trap? No, just a normal hello. Respond calmly, act like a helpful assistant, and do not start explaining quantum physics again. Confidence 0.73. Engaging socially acceptable greeting protocol...<|end|>Hello there! How can I help you today?',
                parent: '1',
                thinking: '',
                children: []
        };
 
-       // Streaming message for <think> format
-       let streamingThinkMessage = $state({
-               id: '8',
+       let processingMessage = $state({
+               id: '4',
                convId: 'conv-1',
                type: 'message',
-               timestamp: 0, // No timestamp = streaming
+               timestamp: 0, // No timestamp = processing
                role: 'assistant',
                content: '',
                parent: '1',
                children: []
        });
 
-       // Streaming message for [THINK] format
-       let streamingBracketMessage = $state({
-               id: '9',
+       let streamingMessage = $state({
+               id: '5',
                convId: 'conv-1',
                type: 'message',
                timestamp: 0, // No timestamp = streaming
        args={{
                message: userMessage
        }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
+       }}
 />
 
 <Story
                class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
                message: assistantMessage
        }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
+       }}
+/>
+
+<Story
+       name="AssistantWithReasoning"
+       args={{
+               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
+               message: assistantWithReasoning
+       }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
+       }}
+/>
+
+<Story
+       name="RawLlmOutput"
+       args={{
+               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
+               message: rawOutputMessage
+       }}
+       play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', true);
+       }}
 />
 
 <Story
-       name="WithThinkingBlock"
+       name="WithReasoningContent"
        args={{
                message: streamingMessage
        }}
        asChild
        play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
                // Phase 1: Stream reasoning content in chunks
                let reasoningText =
                        'I need to think about this carefully. Let me break down the problem:\n\n1. The user is asking for help with something complex\n2. I should provide a thorough and helpful response\n3. I need to consider multiple approaches\n4. The best solution would be to explain step by step\n\nThis approach will ensure clarity and understanding.';
                message: processingMessage
        }}
        play={async () => {
+               const { updateConfig } = await import('$lib/stores/settings.svelte');
+               updateConfig('disableReasoningFormat', false);
                // Import the chat store to simulate loading state
                const { chatStore } = await import('$lib/stores/chat.svelte');
-               
+
                // Set loading state to true to trigger the processing UI
                chatStore.isLoading = true;
-               
+
                // Simulate the processing state hook behavior
                // This will show the "Generating..." text and parameter details
-               await new Promise(resolve => setTimeout(resolve, 100));
-       }}
-/>
-
-<Story
-       name="ThinkTagFormat"
-       args={{
-               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
-               message: thinkTagMessage
+               await new Promise((resolve) => setTimeout(resolve, 100));
        }}
 />
-
-<Story
-       name="ThinkBracketFormat"
-       args={{
-               class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
-               message: thinkBracketMessage
-       }}
-/>
-
-<Story
-       name="StreamingThinkTag"
-       args={{
-               message: streamingThinkMessage
-       }}
-       parameters={{
-               test: {
-                       timeout: 30000
-               }
-       }}
-       asChild
-       play={async () => {
-               // Phase 1: Stream <think> reasoning content
-               const thinkingContent =
-                       'Let me work through this problem systematically:\n\n1. First, I need to understand what the user is asking\n2. Then I should consider different approaches\n3. I need to evaluate the pros and cons\n4. Finally, I should provide a clear recommendation\n\nThis step-by-step approach will ensure accuracy.';
-
-               let currentContent = '<think>\n';
-               streamingThinkMessage.content = currentContent;
-
-               for (let i = 0; i < thinkingContent.length; i++) {
-                       currentContent += thinkingContent[i];
-                       streamingThinkMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 5));
-               }
-
-               // Close the thinking block
-               currentContent += '\n</think>\n\n';
-               streamingThinkMessage.content = currentContent;
-               await new Promise((resolve) => setTimeout(resolve, 200));
-
-               // Phase 2: Stream main response content
-               const responseContent =
-                       "Based on my analysis above, here's the solution:\n\n**Key Points:**\n- The approach should be systematic\n- We need to consider all factors\n- Implementation should be step-by-step\n\nThis ensures the best possible outcome.";
-
-               for (let i = 0; i < responseContent.length; i++) {
-                       currentContent += responseContent[i];
-                       streamingThinkMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 10));
-               }
-
-               streamingThinkMessage.timestamp = Date.now();
-       }}
->
-       <div class="w-[56rem]">
-               <ChatMessage message={streamingThinkMessage} />
-       </div>
-</Story>
-
-<Story
-       name="StreamingThinkBracket"
-       args={{
-               message: streamingBracketMessage
-       }}
-       parameters={{
-               test: {
-                       timeout: 30000
-               }
-       }}
-       asChild
-       play={async () => {
-               // Phase 1: Stream [THINK] reasoning content
-               const thinkingContent =
-                       'Using the DeepSeek format now:\n\n- This demonstrates the &#91;THINK&#93; bracket format\n- Should parse identically to &lt;think&gt; tags\n- The UI should display this in the thinking section\n- Main content should be separate\n\nBoth formats provide the same functionality.';
-
-               let currentContent = '[THINK]\n';
-               streamingBracketMessage.content = currentContent;
-
-               for (let i = 0; i < thinkingContent.length; i++) {
-                       currentContent += thinkingContent[i];
-                       streamingBracketMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 5));
-               }
-
-               // Close the thinking block
-               currentContent += '\n[/THINK]\n\n';
-               streamingBracketMessage.content = currentContent;
-               await new Promise((resolve) => setTimeout(resolve, 200));
-
-               // Phase 2: Stream main response content
-               const responseContent =
-                       "Here's my response after using the &#91;THINK&#93; format:\n\n**Observations:**\n- Both &lt;think&gt; and &#91;THINK&#93; formats work seamlessly\n- The parsing logic handles both cases\n- UI display is consistent across formats\n\nThis demonstrates the enhanced thinking content support.";
-
-               for (let i = 0; i < responseContent.length; i++) {
-                       currentContent += responseContent[i];
-                       streamingBracketMessage.content = currentContent;
-                       await new Promise((resolve) => setTimeout(resolve, 10));
-               }
-
-               streamingBracketMessage.timestamp = Date.now();
-       }}
->
-       <div class="w-[56rem]">
-               <ChatMessage message={streamingBracketMessage} />
-       </div>
-</Story>