From a0748c2e3b449bbb4bf8872ad961881b8f2ac8da Mon Sep 17 00:00:00 2001 From: zhayujie Date: Tue, 21 Apr 2026 20:31:38 +0800 Subject: [PATCH] fix(web): cap reasoning content to 4KB across stream/storage/display --- agent/protocol/agent_stream.py | 39 +++++++++++++- channel/web/static/css/console.css | 16 ++++++ channel/web/static/js/console.js | 83 ++++++++++++++++++++++++++++-- channel/web/web_channel.py | 28 +++++++++- 4 files changed, 158 insertions(+), 8 deletions(-) diff --git a/agent/protocol/agent_stream.py b/agent/protocol/agent_stream.py index 569fcd5a..a0b0af5c 100644 --- a/agent/protocol/agent_stream.py +++ b/agent/protocol/agent_stream.py @@ -13,6 +13,37 @@ from agent.tools.base_tool import BaseTool, ToolResult from common.log import logger +# Maximum number of characters of model "reasoning / thinking" content to persist +# in conversation history. The full reasoning is still streamed to the UI in real +# time (subject to its own SSE / rendering limits); this bound only controls what +# is stored in DB and replayed in history. Long reasoning is not useful for later +# context (the LLM never sees thinking blocks anyway) and bloats DB. +# Keep aligned with the frontend REASONING_RENDER_CAP and the SSE +# MAX_REASONING_STREAM_CHARS so that storage / stream / display all match. +MAX_STORED_REASONING_CHARS = 4 * 1024 # 4 KB + +# Marker inserted between head and tail when reasoning is truncated. +_REASONING_TRUNCATE_MARKER = "\n\n... [reasoning truncated, {omitted} chars omitted] ...\n\n" + + +def _truncate_reasoning_for_storage(text: str) -> str: + """Trim long reasoning to head + tail with an omission marker. + + Keeps the first and last halves of MAX_STORED_REASONING_CHARS so both the + initial chain-of-thought and the final conclusions are preserved for UI + replay, without storing the entire (often very large) middle. 
+ """ + if not text: + return text + if len(text) <= MAX_STORED_REASONING_CHARS: + return text + half = MAX_STORED_REASONING_CHARS // 2 + head = text[:half] + tail = text[-half:] + omitted = len(text) - len(head) - len(tail) + return head + _REASONING_TRUNCATE_MARKER.format(omitted=omitted) + tail + + class AgentStreamExecutor: """ Agent Stream Executor @@ -830,9 +861,15 @@ class AgentStreamExecutor: assistant_msg = {"role": "assistant", "content": []} if full_reasoning: + stored_reasoning = _truncate_reasoning_for_storage(full_reasoning) + if len(stored_reasoning) < len(full_reasoning): + logger.info( + f"[reasoning] truncated for storage: " + f"{len(full_reasoning)} -> {len(stored_reasoning)} chars" + ) assistant_msg["content"].append({ "type": "thinking", - "thinking": full_reasoning + "thinking": stored_reasoning }) if full_content: diff --git a/channel/web/static/css/console.css b/channel/web/static/css/console.css index 7f8b594c..44ae8c65 100644 --- a/channel/web/static/css/console.css +++ b/channel/web/static/css/console.css @@ -509,6 +509,22 @@ color: #b0b8c4; margin-bottom: 0.375rem; } +/* Streaming reasoning: render as plain pre to avoid expensive markdown + re-parsing on every chunk. Wrap long lines so the bubble width is + respected and use the same font size/color as the rendered version. 
*/ +.agent-thinking-step .thinking-stream-pre { + margin: 0; + padding: 0; + background: transparent; + border: 0; + font-family: inherit; + font-size: inherit; + line-height: 1.5; + color: inherit; + white-space: pre-wrap; + word-break: break-word; + overflow-wrap: anywhere; +} /* Content step - real text output frozen before tool calls */ .agent-content-step { diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js index 60fb4245..c0ae38a2 100644 --- a/channel/web/static/js/console.js +++ b/channel/web/static/js/console.js @@ -1026,17 +1026,60 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) { reasoningStartTime = Date.now(); currentReasoningEl = document.createElement('div'); currentReasoningEl.className = 'agent-step agent-thinking-step'; + // During streaming, use a
<pre> with a single text node and
+                    // append-only updates. This avoids re-parsing markdown and
+                    // re-setting innerHTML on every chunk, which is what causes
+                    // the page to crash on long chains-of-thought.
                     currentReasoningEl.innerHTML = `
                         
${t('thinking_in_progress')}
-
`; +
`; stepsEl.appendChild(currentReasoningEl); + const preEl = currentReasoningEl.querySelector('.thinking-stream-pre'); + preEl.appendChild(document.createTextNode('')); + currentReasoningEl._streamTextNode = preEl.firstChild; + currentReasoningEl._streamPendingText = ''; + currentReasoningEl._streamRafScheduled = false; + currentReasoningEl._streamCharsRendered = 0; + currentReasoningEl._streamCapped = false; + } + // Hard cap: once REASONING_RENDER_CAP chars are in the DOM, stop + // appending further deltas. The full text is still kept in + // `reasoningText` for finalize-time head+tail rendering. + if (!currentReasoningEl._streamCapped) { + currentReasoningEl._streamPendingText += item.content; + if (!currentReasoningEl._streamRafScheduled) { + currentReasoningEl._streamRafScheduled = true; + const elRef = currentReasoningEl; + requestAnimationFrame(() => { + elRef._streamRafScheduled = false; + if (!elRef.isConnected || !elRef._streamTextNode) return; + let pending = elRef._streamPendingText; + elRef._streamPendingText = ''; + if (!pending) return; + const remaining = REASONING_RENDER_CAP - elRef._streamCharsRendered; + if (remaining <= 0) { + elRef._streamCapped = true; + } else { + if (pending.length > remaining) { + pending = pending.slice(0, remaining); + elRef._streamCapped = true; + } + elRef._streamTextNode.appendData(pending); + elRef._streamCharsRendered += pending.length; + if (elRef._streamCapped) { + elRef._streamTextNode.appendData( + '\n\n... [reasoning truncated for display] ...' + ); + } + } + scrollChatToBottom(); + }); + } } - currentReasoningEl.querySelector('.thinking-full').innerHTML = renderMarkdown(reasoningText); - scrollChatToBottom(); } else if (item.type === 'delta') { ensureBotEl(); @@ -1334,11 +1377,41 @@ function renderToolCallsHtml(toolCalls) { }).join(''); } +// Cap for rendering reasoning content in the bubble. 
Beyond this size, +// we skip markdown rendering entirely and show plain text head + tail to +// keep the page responsive (very long chains-of-thought can otherwise +// stall or crash the browser when re-parsed by marked.js). +// Keep this in sync with backend MAX_STORED_REASONING_CHARS and +// MAX_REASONING_STREAM_CHARS so storage / SSE / display stay aligned. +const REASONING_RENDER_CAP = 4 * 1024; // 4 KB + +function _truncateReasoningForDisplay(text) { + if (!text || text.length <= REASONING_RENDER_CAP) return { text, truncated: false, omitted: 0 }; + const half = Math.floor(REASONING_RENDER_CAP / 2); + const head = text.slice(0, half); + const tail = text.slice(-half); + return { + text: head + '\n\n... [' + (text.length - head.length - tail.length) + ' chars omitted] ...\n\n' + tail, + truncated: true, + omitted: text.length - head.length - tail.length, + }; +} + +function _renderReasoningBody(text) { + // For short reasoning, render as markdown. For long ones, fall back to + // an escaped
<pre> block to avoid expensive markdown parsing.
+    const { text: shown, truncated } = _truncateReasoningForDisplay(text);
+    if (truncated || shown.length > REASONING_RENDER_CAP) {
+        return '
' + escapeHtml(shown) + '
'; + } + return renderMarkdown(shown); +} + function finalizeThinking(el, startTime, text) { const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); el.querySelector('.thinking-summary').textContent = t('thinking_done'); const fullDiv = el.querySelector('.thinking-full'); - fullDiv.innerHTML = `
${t('thinking_duration')} ${elapsed}s
` + renderMarkdown(text); + fullDiv.innerHTML = `
${t('thinking_duration')} ${elapsed}s
` + _renderReasoningBody(text); } function renderThinkingHtml(text) { @@ -1351,7 +1424,7 @@ function renderThinkingHtml(text) { ${t('thinking_done')} -
${renderMarkdown(full)}
+
${_renderReasoningBody(full)}
`; } diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py index 59102994..753e2d4c 100644 --- a/channel/web/web_channel.py +++ b/channel/web/web_channel.py @@ -225,6 +225,17 @@ class WebChannel(ChatChannel): def _make_sse_callback(self, request_id: str): """Build an on_event callback that pushes agent stream events into the SSE queue.""" + # Cap reasoning bytes pushed to the frontend per request to avoid + # browser stalls / crashes on very long chains-of-thought. Anything + # beyond the cap is dropped from the stream (DB still persists a + # truncated copy via _truncate_reasoning_for_storage). + # Keep aligned with frontend REASONING_RENDER_CAP and backend + # MAX_STORED_REASONING_CHARS. + MAX_REASONING_STREAM_CHARS = 4 * 1024 # 4 KB + # Use a single-element list as a mutable counter accessible from closure. + reasoning_chars_sent = [0] + reasoning_capped_notified = [False] + def on_event(event: dict): if request_id not in self.sse_queues: return @@ -234,8 +245,21 @@ class WebChannel(ChatChannel): if event_type == "reasoning_update": delta = data.get("delta", "") - if delta: - q.put({"type": "reasoning", "content": delta}) + if not delta: + return + remaining = MAX_REASONING_STREAM_CHARS - reasoning_chars_sent[0] + if remaining <= 0: + if not reasoning_capped_notified[0]: + reasoning_capped_notified[0] = True + q.put({ + "type": "reasoning", + "content": "\n\n... [reasoning truncated for display] ...", + }) + return + if len(delta) > remaining: + delta = delta[:remaining] + reasoning_chars_sent[0] += len(delta) + q.put({"type": "reasoning", "content": delta}) elif event_type == "message_update": delta = data.get("delta", "")