From ae1115991894aca9e4973ef96ccd029f1f7a9431 Mon Sep 17 00:00:00 2001 From: zhayujie Date: Fri, 24 Apr 2026 15:22:45 +0800 Subject: [PATCH] feat(models): unify enable_thinking for deepseek-v4 and other thinking models --- README.md | 2 +- agent/protocol/agent_stream.py | 25 +++++++++++++++--- bridge/agent_bridge.py | 28 +++++++++++--------- config-template.json | 1 + config.py | 2 +- docs/channels/web.mdx | 2 +- docs/cli/general.mdx | 2 +- docs/en/cli/general.mdx | 2 +- docs/intro/architecture.mdx | 2 +- docs/ja/cli/general.mdx | 2 +- models/linkai/link_ai_bot.py | 44 ++++++++++++++++++++++++++++++++ skills/image-generation/SKILL.md | 2 +- 12 files changed, 91 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 4975ea72..fc03de3f 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,7 @@ cow install-browser "agent_max_context_tokens": 50000, # Agent 模式下最大上下文 tokens,超出将自动智能压缩处理 "agent_max_context_turns": 20, # Agent 模式下最大上下文记忆轮次,一问一答为一轮,超出后智能压缩处理 "agent_max_steps": 20, # Agent 模式下单次任务的最大决策步数,超出后将停止继续调用工具 - "enable_thinking": false # 是否启用深度思考,开启后 Web 端展示模型推理过程,关闭后可加速响应 + "enable_thinking": false # 是否启用深度思考模式(适用于 deepseek-v4-pro/flash、deepseek-reasoner、kimi-k2-thinking 等思考型模型)。开启后模型在出最终回答前先进行推理,回答质量更高但首字延迟增加;Web 端会展示思考过程,IM 渠道(微信/企微/钉钉/飞书)虽不展示但同样获得更好答案 } ``` diff --git a/agent/protocol/agent_stream.py b/agent/protocol/agent_stream.py index 547603f0..4e077653 100644 --- a/agent/protocol/agent_stream.py +++ b/agent/protocol/agent_stream.py @@ -110,6 +110,23 @@ class AgentStreamExecutor: logger.error(f"Event callback error: {e}") def _is_thinking_enabled(self) -> bool: + """Whether deep-thinking mode is on at the model layer. + + Mirrors the global toggle used by ``bridge.agent_bridge`` when deciding + whether to send ``thinking={"type": "enabled"}`` to the model. Used for + logging and reasoning-update event emission across all channels. 
+ """ + from config import conf + return bool(conf().get("enable_thinking", False)) + + def _should_render_thinking_inline(self) -> bool: + """Whether ``<think>...</think>`` blocks embedded directly in ``content`` + (MiniMax, some third-party proxies) should be surfaced to the channel. + + Only the Web console can render them in a collapsible panel. IM channels + (WeChat/WeCom/DingTalk/Feishu) must strip them, otherwise users see raw + XML tags in their chat. + """ from config import conf channel_type = getattr(self.model, 'channel_type', '') or '' return conf().get("enable_thinking", False) and channel_type == 'web' @@ -119,13 +136,15 @@ class AgentStreamExecutor: Handle <think>...</think> blocks in content returned by some LLM providers (e.g., MiniMax). - - When thinking is enabled: remove the tags but keep the content inside. - - When thinking is disabled: remove both the tags and the content entirely. + - When inline thinking rendering is allowed (Web + thinking enabled): + remove only the tags, keep the content inside. + - Otherwise (IM channels, or thinking disabled globally): remove both + the tags and the content entirely. """ if not text: return text import re - if self._is_thinking_enabled(): + if self._should_render_thinking_inline(): text = re.sub(r'<think>', '', text) text = re.sub(r'</think>', '', text) else: diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py index 2aac1221..3701df77 100644 --- a/bridge/agent_bridge.py +++ b/bridge/agent_bridge.py @@ -167,13 +167,15 @@ class AgentLLMModel(LLMModel): if session_id: kwargs['session_id'] = session_id - # Determine thinking: respect global config, then channel_type + # Thinking mode is a global toggle independent of the channel. + # IM channels (WeChat/WeCom/DingTalk/Feishu) won't render the + # reasoning trace, but still benefit from the higher answer + # quality the thinking pass produces. 
from config import conf - global_thinking = conf().get("enable_thinking", False) - if not global_thinking: - kwargs['thinking'] = {"type": "disabled"} - else: - kwargs['thinking'] = {"type": "enabled"} if channel_type == "web" else {"type": "disabled"} + kwargs['thinking'] = ( + {"type": "enabled"} if conf().get("enable_thinking", False) + else {"type": "disabled"} + ) response = self.bot.call_with_tools(**kwargs) return self._format_response(response) @@ -220,13 +222,15 @@ class AgentLLMModel(LLMModel): if session_id: kwargs['session_id'] = session_id - # Determine thinking: respect global config, then channel_type + # Thinking mode is a global toggle independent of the channel. + # IM channels (WeChat/WeCom/DingTalk/Feishu) won't render the + # reasoning trace, but still benefit from the higher answer + # quality the thinking pass produces. from config import conf - global_thinking = conf().get("enable_thinking", False) - if not global_thinking: - kwargs['thinking'] = {"type": "disabled"} - else: - kwargs['thinking'] = {"type": "enabled"} if channel_type == "web" else {"type": "disabled"} + kwargs['thinking'] = ( + {"type": "enabled"} if conf().get("enable_thinking", False) + else {"type": "disabled"} + ) stream = self.bot.call_with_tools(**kwargs) diff --git a/config-template.json b/config-template.json index d6b13223..ecef9166 100644 --- a/config-template.json +++ b/config-template.json @@ -31,5 +31,6 @@ "agent_max_context_tokens": 50000, "agent_max_context_turns": 20, "agent_max_steps": 20, + "enable_thinking": false, "knowledge": true } diff --git a/config.py b/config.py index 7c597ba7..d175c06f 100644 --- a/config.py +++ b/config.py @@ -204,7 +204,7 @@ available_setting = { "agent_max_context_tokens": 50000, # Agent模式下最大上下文tokens "agent_max_context_turns": 20, # Agent模式下最大上下文记忆轮次 "agent_max_steps": 20, # Agent模式下单次运行最大决策步数 - "enable_thinking": False, # Whether to enable deep thinking for web channel + "enable_thinking": False, # Toggle deep-thinking mode for 
thinking-capable models (e.g. deepseek-v4-pro/flash, deepseek-reasoner, kimi-k2-thinking). When enabled, the model produces a reasoning trace before the final answer; the Web console renders it in a collapsible panel, while IM channels (WeChat/WeCom/DingTalk/Feishu) still benefit from the improved answer quality but do not display the trace. Note: enabling thinking increases first-token latency. "knowledge": True, # 是否开启知识库功能 # Per-skill runtime config. Nested keys are flattened to env vars at startup # using the rule: skill[][] -> SKILL__ diff --git a/docs/channels/web.mdx b/docs/channels/web.mdx index 7ce84ad1..f1440325 100644 --- a/docs/channels/web.mdx +++ b/docs/channels/web.mdx @@ -22,7 +22,7 @@ Web 控制台是 CowAgent 的默认通道,启动后会自动运行,通过浏 | `web_port` | Web 服务监听端口 | `9899` | | `web_password` | 访问密码,留空表示不启用密码保护 | `""` | | `web_session_expire_days` | 登录会话有效天数 | `30` | -| `enable_thinking` | 是否启用深度思考,开启后 Web 端展示推理过程,关闭可加速响应 | `false` | +| `enable_thinking` | 全局深度思考开关(影响所有渠道)。开启后思考型模型(deepseek-v4-pro/flash、deepseek-reasoner、kimi-k2-thinking 等)会先推理再作答;Web 端会展示思考过程,回答质量更高但首字延迟增加 | `false` | 配置密码后,访问控制台时需先输入密码完成登录。登录状态默认保持 30 天,期间重启服务也无需重新登录。密码也支持在控制台的「配置」页面中在线修改。 diff --git a/docs/cli/general.mdx b/docs/cli/general.mdx index 5b972da0..31d383ea 100644 --- a/docs/cli/general.mdx +++ b/docs/cli/general.mdx @@ -69,7 +69,7 @@ Session: 12 messages | 8 skills loaded | `agent_max_context_tokens` | 最大上下文 tokens | `40000` | | `agent_max_context_turns` | 最大上下文记忆轮次 | `30` | | `agent_max_steps` | 单次任务最大决策步数 | `15` | -| `enable_thinking` | 是否启用深度思考 | `true` / `false` | +| `enable_thinking` | 是否启用深度思考(全局开关,对所有渠道生效;Web 端会展示思考过程,IM 渠道不展示但同样受益于更高的回答质量) | `true` / `false` | 修改 `model` 时,系统会自动匹配对应的模型调用方式。配置会写入 `config.json` 并持久保存。 diff --git a/docs/en/cli/general.mdx b/docs/en/cli/general.mdx index 8d6dce68..59cd11de 100644 --- a/docs/en/cli/general.mdx +++ b/docs/en/cli/general.mdx @@ -55,7 +55,7 @@ View or modify runtime configuration. 
Changes take effect immediately without re | `agent_max_context_tokens` | Max context tokens | `40000` | | `agent_max_context_turns` | Max context memory turns | `30` | | `agent_max_steps` | Max decision steps per task | `15` | -| `enable_thinking` | Enable deep thinking | `true` / `false` | +| `enable_thinking` | Enable deep thinking (global toggle, applies to all channels; Web console renders the reasoning trace, IM channels don't display it but still benefit from improved answer quality) | `true` / `false` | When changing `model`, the system automatically matches the corresponding model API. Configuration is persisted to `config.json`. diff --git a/docs/intro/architecture.mdx b/docs/intro/architecture.mdx index 8ca806c0..fdacc84a 100644 --- a/docs/intro/architecture.mdx +++ b/docs/intro/architecture.mdx @@ -81,5 +81,5 @@ Agent 的工作空间默认位于 `~/cow` 目录,用于存储系统提示词 | `agent_max_context_tokens` | 最大上下文 token 数 | `50000` | | `agent_max_context_turns` | 最大上下文记忆轮次 | `20` | | `agent_max_steps` | 单次任务最大决策步数 | `20` | -| `enable_thinking` | 是否启用深度思考,开启后 Web 端展示推理过程,关闭可加速响应 | `false` | +| `enable_thinking` | 是否启用深度思考模式(适用于 deepseek-v4-pro/flash、deepseek-reasoner、kimi-k2-thinking 等思考型模型)。开启后所有渠道下模型都会先思考再回答,回答质量更高但首字延迟增加;Web 端会展示思考过程,IM 渠道(微信/企微/钉钉/飞书)虽不展示但同样获得更好答案 | `false` | | `knowledge` | 是否启用个人知识库 | `true` | diff --git a/docs/ja/cli/general.mdx b/docs/ja/cli/general.mdx index de597b01..af4a83f7 100644 --- a/docs/ja/cli/general.mdx +++ b/docs/ja/cli/general.mdx @@ -55,7 +55,7 @@ description: ステータスの確認、設定管理、コンテキスト制御 | `agent_max_context_tokens` | 最大コンテキストトークン数 | `40000` | | `agent_max_context_turns` | 最大コンテキスト記憶ターン数 | `30` | | `agent_max_steps` | タスクごとの最大判断ステップ数 | `15` | -| `enable_thinking` | ディープシンキングの有効化 | `true` / `false` | +| `enable_thinking` | ディープシンキングの有効化(全チャネル共通のグローバルトグル。Web コンソールでは思考過程を折りたたみ表示、IM チャネルでは表示されないものの回答品質の向上は享受可能) | `true` / `false` | `model` を変更すると、システムが対応するモデル API を自動的にマッチングします。設定は `config.json` に永続的に保存されます。 diff --git a/models/linkai/link_ai_bot.py 
b/models/linkai/link_ai_bot.py index 6eb1e142..191b2133 100644 --- a/models/linkai/link_ai_bot.py +++ b/models/linkai/link_ai_bot.py @@ -686,7 +686,51 @@ def _handle_linkai_stream_response(self, base_url, headers, body): "status_code": 500 } +def _linkai_convert_messages_to_openai_format(self, messages): + """ + Override the base OpenAI-compatible conversion to round-trip + ``reasoning_content`` on assistant messages. + + Internally, the agent layer keeps the model's reasoning trace as a + Claude-style ``thinking`` content block on the assistant message. The + base converter drops that block. For thinking-capable models proxied via + LinkAI (DeepSeek V4, Kimi K2 thinking, …), the upstream API requires + the trace to be echoed back as a top-level ``reasoning_content`` field + on every assistant turn that contained tool calls — otherwise the next + request returns 400. We re-emit it for every assistant turn (it's + silently ignored on plain text turns). + """ + openai_messages = OpenAICompatibleBot._convert_messages_to_openai_format(self, messages) + if not messages: + return openai_messages + + # Walk the original Claude messages to collect each assistant turn's + # reasoning text, then attach it to the matching converted entry. + dst_idx = 0 + for src in messages: + if src.get("role") != "assistant": + continue + content = src.get("content") + reasoning_parts = [] + if isinstance(content, list): + reasoning_parts = [ + b.get("thinking", "") for b in content + if isinstance(b, dict) and b.get("type") == "thinking" + ] + # Locate the corresponding assistant entry in the converted list. 
+ while dst_idx < len(openai_messages) and openai_messages[dst_idx].get("role") != "assistant": + dst_idx += 1 + if dst_idx >= len(openai_messages): + break + if reasoning_parts: + openai_messages[dst_idx]["reasoning_content"] = "\n".join(reasoning_parts) + dst_idx += 1 + + return openai_messages + + # Attach methods to LinkAIBot class LinkAIBot.call_with_tools = _linkai_call_with_tools LinkAIBot._handle_linkai_sync_response = _handle_linkai_sync_response LinkAIBot._handle_linkai_stream_response = _handle_linkai_stream_response +LinkAIBot._convert_messages_to_openai_format = _linkai_convert_messages_to_openai_format diff --git a/skills/image-generation/SKILL.md b/skills/image-generation/SKILL.md index b55e192c..d2601629 100644 --- a/skills/image-generation/SKILL.md +++ b/skills/image-generation/SKILL.md @@ -45,7 +45,7 @@ python /scripts/generate.py '' | `size` | string | no | auto | `512` / `1K` / `2K` / `3K` / `4K`, or pixel value (`1024x1024`) | | `aspect_ratio` | string | no | null | `1:1` / `3:2` / `2:3` / `16:9` / `9:16` / `21:9` (some backends also support extreme ratios like `1:4` / `8:1`) | -**Higher `quality` and larger `size` cost more and run slower.** Default to omitting both (`auto`) so the model picks a balanced setting. Only raise them when the user explicitly asks for high quality / a poster / print-ready output. For quick previews or chat scenarios prefer `quality=low` + `size=1K`. +**Higher `quality` and larger `size` cost more and run slower.** In normal cases, when the user does not explicitly specify, `low` or `medium` is sufficient. Only use `high` when the user asks for it. ### Example — generate