From a0748c2e3b449bbb4bf8872ad961881b8f2ac8da Mon Sep 17 00:00:00 2001
From: zhayujie <yjzha1996@163.com>
Date: Tue, 21 Apr 2026 20:31:38 +0800
Subject: [PATCH] fix(web): cap reasoning content to 4KB across
 stream/storage/display

---
 agent/protocol/agent_stream.py     | 39 +++++++++++++-
 channel/web/static/css/console.css | 16 ++++++
 channel/web/static/js/console.js   | 83 ++++++++++++++++++++++++++++--
 channel/web/web_channel.py         | 28 +++++++++-
 4 files changed, 158 insertions(+), 8 deletions(-)

diff --git a/agent/protocol/agent_stream.py b/agent/protocol/agent_stream.py
index 569fcd5a..a0b0af5c 100644
--- a/agent/protocol/agent_stream.py
+++ b/agent/protocol/agent_stream.py
@@ -13,6 +13,37 @@ from agent.tools.base_tool import BaseTool, ToolResult
 from common.log import logger
 
 
+# Maximum number of characters of model "reasoning / thinking" content to persist
+# in conversation history. The full reasoning is still streamed to the UI in real
+# time (subject to its own SSE / rendering limits); this bound only controls what
+# is stored in DB and replayed in history. Long reasoning is not useful for later
+# context (the LLM never sees thinking blocks anyway) and bloats DB.
+# Keep aligned with the frontend REASONING_RENDER_CAP and the SSE
+# MAX_REASONING_STREAM_CHARS so that storage / stream / display all match.
+MAX_STORED_REASONING_CHARS = 4 * 1024  # 4 KB
+
+# Marker inserted between head and tail when reasoning is truncated.
+_REASONING_TRUNCATE_MARKER = "\n\n... [reasoning truncated, {omitted} chars omitted] ...\n\n"
+
+
+def _truncate_reasoning_for_storage(text: str) -> str:
+    """Trim long reasoning to head + marker + tail within the storage cap.
+
+    Empty or within-cap text is returned unchanged. Otherwise the marker is
+    charged against MAX_STORED_REASONING_CHARS, so (for any cap larger than
+    the marker) the result fits the cap and is shorter than the input.
+    """
+    if not text or len(text) <= MAX_STORED_REASONING_CHARS:
+        return text
+    # Reserve room for the widest marker this input can need (omitted < len).
+    reserve = len(_REASONING_TRUNCATE_MARKER.format(omitted=len(text)))
+    budget = max(0, MAX_STORED_REASONING_CHARS - reserve)
+    tail_len = budget // 2
+    head, tail = text[:budget - tail_len], text[len(text) - tail_len:]
+    omitted = len(text) - len(head) - len(tail)
+    return head + _REASONING_TRUNCATE_MARKER.format(omitted=omitted) + tail
+
+
 class AgentStreamExecutor:
     """
     Agent Stream Executor
@@ -830,9 +861,15 @@ class AgentStreamExecutor:
         assistant_msg = {"role": "assistant", "content": []}
 
         if full_reasoning:
+            stored_reasoning = _truncate_reasoning_for_storage(full_reasoning)
+            if len(stored_reasoning) < len(full_reasoning):
+                logger.info(
+                    f"[reasoning] truncated for storage: "
+                    f"{len(full_reasoning)} -> {len(stored_reasoning)} chars"
+                )
             assistant_msg["content"].append({
                 "type": "thinking",
-                "thinking": full_reasoning
+                "thinking": stored_reasoning
             })
 
         if full_content:
diff --git a/channel/web/static/css/console.css b/channel/web/static/css/console.css
index 7f8b594c..44ae8c65 100644
--- a/channel/web/static/css/console.css
+++ b/channel/web/static/css/console.css
@@ -509,6 +509,22 @@
     color: #b0b8c4;
     margin-bottom: 0.375rem;
 }
+/* Streaming reasoning: render as plain pre to avoid expensive markdown
+   re-parsing on every chunk. Wrap long lines so the bubble width is
+   respected and use the same font size/color as the rendered version. */
+.agent-thinking-step .thinking-stream-pre {
+    margin: 0;
+    padding: 0;
+    background: transparent;
+    border: 0;
+    font-family: inherit; /* take the bubble's font rather than a <pre>-specific one */
+    font-size: inherit;
+    line-height: 1.5;
+    color: inherit;
+    white-space: pre-wrap; /* keep newlines/spaces, but still wrap at the box edge */
+    word-break: break-word;
+    overflow-wrap: anywhere; /* permit breaks inside long unbroken tokens */
+}
 
 /* Content step - real text output frozen before tool calls */
 .agent-content-step {
diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js
index 60fb4245..c0ae38a2 100644
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -1026,17 +1026,60 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                     reasoningStartTime = Date.now();
                     currentReasoningEl = document.createElement('div');
                     currentReasoningEl.className = 'agent-step agent-thinking-step';
+                    // During streaming, use a <pre> with a single text node and
+                    // append-only updates. This avoids re-parsing markdown and
+                    // re-setting innerHTML on every chunk, which is what causes
+                    // the page to crash on long chains-of-thought.
                     currentReasoningEl.innerHTML = `
                         <div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
                             <i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
                             <span class="thinking-summary">${t('thinking_in_progress')}</span>
                             <i class="fas fa-chevron-right thinking-chevron"></i>
                         </div>
-                        <div class="thinking-full"></div>`;
+                        <div class="thinking-full"><pre class="thinking-stream-pre"></pre></div>`;
                     stepsEl.appendChild(currentReasoningEl);
+                    const preEl = currentReasoningEl.querySelector('.thinking-stream-pre');
+                    preEl.appendChild(document.createTextNode(''));
+                    currentReasoningEl._streamTextNode = preEl.firstChild;
+                    currentReasoningEl._streamPendingText = '';
+                    currentReasoningEl._streamRafScheduled = false;
+                    currentReasoningEl._streamCharsRendered = 0;
+                    currentReasoningEl._streamCapped = false;
+                }
+                // Hard cap: once REASONING_RENDER_CAP chars are in the DOM, stop
+                // appending further deltas. The full text is still kept in
+                // `reasoningText` for finalize-time head+tail rendering.
+                if (!currentReasoningEl._streamCapped) {
+                    currentReasoningEl._streamPendingText += item.content;
+                    if (!currentReasoningEl._streamRafScheduled) {
+                        currentReasoningEl._streamRafScheduled = true;
+                        const elRef = currentReasoningEl;
+                        requestAnimationFrame(() => {
+                            elRef._streamRafScheduled = false;
+                            if (!elRef.isConnected || !elRef._streamTextNode) return;
+                            let pending = elRef._streamPendingText;
+                            elRef._streamPendingText = '';
+                            if (!pending) return;
+                            const remaining = REASONING_RENDER_CAP - elRef._streamCharsRendered;
+                            if (remaining <= 0) {
+                                elRef._streamCapped = true;
+                            } else {
+                                if (pending.length > remaining) {
+                                    pending = pending.slice(0, remaining);
+                                    elRef._streamCapped = true;
+                                }
+                                elRef._streamTextNode.appendData(pending);
+                                elRef._streamCharsRendered += pending.length;
+                                if (elRef._streamCapped) {
+                                    elRef._streamTextNode.appendData(
+                                        '\n\n... [reasoning truncated for display] ...'
+                                    );
+                                }
+                            }
+                            scrollChatToBottom();
+                        });
+                    }
                 }
-                currentReasoningEl.querySelector('.thinking-full').innerHTML = renderMarkdown(reasoningText);
-                scrollChatToBottom();
 
             } else if (item.type === 'delta') {
                 ensureBotEl();
@@ -1334,11 +1377,41 @@ function renderToolCallsHtml(toolCalls) {
     }).join('');
 }
 
+// Cap for rendering reasoning content in the bubble. Beyond this size,
+// we skip markdown rendering entirely and show plain text head + tail to
+// keep the page responsive (very long chains-of-thought can otherwise
+// stall or crash the browser when re-parsed by marked.js).
+// Keep this in sync with backend MAX_STORED_REASONING_CHARS and
+// MAX_REASONING_STREAM_CHARS so storage / SSE / display stay aligned.
+const REASONING_RENDER_CAP = 4 * 1024; // 4 KB
+
+function _truncateReasoningForDisplay(text) {
+    // Falsy or within-cap input passes through untouched.
+    const fits = !text || text.length <= REASONING_RENDER_CAP;
+    if (fits) return { text, truncated: false, omitted: 0 };
+    // Keep the first and last half-cap characters; drop the middle.
+    const keep = Math.floor(REASONING_RENDER_CAP / 2);
+    const first = text.slice(0, keep);
+    const last = text.slice(-keep);
+    const omitted = text.length - first.length - last.length;
+    return { text: `${first}\n\n... [${omitted} chars omitted] ...\n\n${last}`, truncated: true, omitted };
+}
+
+function _renderReasoningBody(text) {
+    // Short reasoning renders as markdown; over-cap reasoning is cut to
+    // head + tail and shown in an escaped <pre> to skip markdown parsing.
+    // Branch on `truncated` only: a `shown.length` check is dead for strings
+    // and throws when text is null/undefined (passed through by the helper).
+    const { text: shown, truncated } = _truncateReasoningForDisplay(text);
+    if (!truncated) return renderMarkdown(shown);
+    return '<pre class="thinking-stream-pre">' + escapeHtml(shown) + '</pre>';
+}
+
 function finalizeThinking(el, startTime, text) {
     const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
     el.querySelector('.thinking-summary').textContent = t('thinking_done');
     const fullDiv = el.querySelector('.thinking-full');
-    fullDiv.innerHTML = `<div class="thinking-duration">${t('thinking_duration')} ${elapsed}s</div>` + renderMarkdown(text);
+    fullDiv.innerHTML = `<div class="thinking-duration">${t('thinking_duration')} ${elapsed}s</div>` + _renderReasoningBody(text);
 }
 
 function renderThinkingHtml(text) {
@@ -1351,7 +1424,7 @@ function renderThinkingHtml(text) {
         <span class="thinking-summary">${t('thinking_done')}</span>
         <i class="fas fa-chevron-right thinking-chevron"></i>
     </div>
-    <div class="thinking-full">${renderMarkdown(full)}</div>
+    <div class="thinking-full">${_renderReasoningBody(full)}</div>
 </div>`;
 }
 
diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py
index 59102994..753e2d4c 100644
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -225,6 +225,17 @@ class WebChannel(ChatChannel):
     def _make_sse_callback(self, request_id: str):
         """Build an on_event callback that pushes agent stream events into the SSE queue."""
 
+        # Cap reasoning characters pushed to the frontend per request to avoid
+        # browser stalls / crashes on very long chains-of-thought. Anything
+        # beyond the cap is dropped from the stream (DB still persists a
+        # truncated copy via _truncate_reasoning_for_storage).
+        # Keep aligned with frontend REASONING_RENDER_CAP and backend
+        # MAX_STORED_REASONING_CHARS.
+        MAX_REASONING_STREAM_CHARS = 4 * 1024  # 4 KB
+        # Use a single-element list as a mutable counter accessible from closure.
+        reasoning_chars_sent = [0]
+        reasoning_capped_notified = [False]
+
         def on_event(event: dict):
             if request_id not in self.sse_queues:
                 return
@@ -234,8 +245,21 @@ class WebChannel(ChatChannel):
 
             if event_type == "reasoning_update":
                 delta = data.get("delta", "")
-                if delta:
-                    q.put({"type": "reasoning", "content": delta})
+                if not delta:
+                    return
+                remaining = MAX_REASONING_STREAM_CHARS - reasoning_chars_sent[0]
+                if remaining <= 0:
+                    if not reasoning_capped_notified[0]:
+                        reasoning_capped_notified[0] = True
+                        q.put({
+                            "type": "reasoning",
+                            "content": "\n\n... [reasoning truncated for display] ...",
+                        })
+                    return
+                if len(delta) > remaining:
+                    delta = delta[:remaining]
+                reasoning_chars_sent[0] += len(delta)
+                q.put({"type": "reasoning", "content": delta})
 
             elif event_type == "message_update":
                 delta = data.get("delta", "")