mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(models): unify enable_thinking for deepseek-v4 and other thinking models
This commit is contained in:
@@ -208,7 +208,7 @@ cow install-browser
|
||||
"agent_max_context_tokens": 50000, # Agent 模式下最大上下文 tokens,超出将自动智能压缩处理
|
||||
"agent_max_context_turns": 20, # Agent 模式下最大上下文记忆轮次,一问一答为一轮,超出后智能压缩处理
|
||||
"agent_max_steps": 20, # Agent 模式下单次任务的最大决策步数,超出后将停止继续调用工具
|
||||
"enable_thinking": false # 是否启用深度思考,开启后 Web 端展示模型推理过程,关闭后可加速响应
|
||||
"enable_thinking": false # 是否启用深度思考模式(适用于 deepseek-v4-pro/flash、deepseek-reasoner、kimi-k2-thinking 等思考型模型)。开启后模型在出最终回答前先进行推理,回答质量更高但首字延迟增加;Web 端会展示思考过程,IM 渠道(微信/企微/钉钉/飞书)虽不展示但同样获得更好答案
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
@@ -110,6 +110,23 @@ class AgentStreamExecutor:
|
||||
logger.error(f"Event callback error: {e}")
|
||||
|
||||
def _is_thinking_enabled(self) -> bool:
    """Report whether the global deep-thinking switch is turned on.

    Reads the ``enable_thinking`` entry from the runtime config (treated as
    off when the key is absent) and coerces it to a strict ``bool``.

    This is the same toggle ``bridge.agent_bridge`` consults when deciding
    whether to send ``thinking={"type": "enabled"}`` to the model. It is
    used for logging and for emitting reasoning-update events on every
    channel.

    Returns:
        ``True`` when ``enable_thinking`` is set to a truthy value,
        ``False`` otherwise.
    """
    # Imported lazily, matching the other config lookups in this class.
    from config import conf

    settings = conf()
    thinking_flag = settings.get("enable_thinking", False)
    return bool(thinking_flag)
|
||||
|
||||
def _should_render_thinking_inline(self) -> bool:
    """Decide whether inline ``<think>...</think>`` content may be shown.

    Some providers (MiniMax, some third-party proxies) embed their
    reasoning directly in ``content`` as ``<think>...</think>`` blocks.
    Only the Web console can render that text in a collapsible panel. IM
    channels (WeChat/WeCom/DingTalk/Feishu) must strip it, otherwise users
    see raw XML tags in their chat.

    Returns:
        ``True`` only when deep thinking is enabled globally AND the
        model's ``channel_type`` is ``'web'``; ``False`` in every other
        case. The result is always a real ``bool``, never the raw config
        value.
    """
    # A missing or None channel_type is normalised to '' so the comparison
    # below simply evaluates to False.
    channel_type = getattr(self.model, 'channel_type', '') or ''
    # Go through _is_thinking_enabled() so the config key is read in one
    # place and the left operand is already a strict bool; a bare
    # ``conf().get(...) and ...`` would leak None/0/'' to callers when the
    # toggle is off.
    return self._is_thinking_enabled() and channel_type == 'web'
|
||||
@@ -119,13 +136,15 @@ class AgentStreamExecutor:
|
||||
Handle <think>...</think> blocks in content returned by some LLM providers
|
||||
(e.g., MiniMax).
|
||||
|
||||
- When thinking is enabled: remove the tags but keep the content inside.
|
||||
- When thinking is disabled: remove both the tags and the content entirely.
|
||||
- When inline thinking rendering is allowed (Web + thinking enabled):
|
||||
remove only the tags, keep the content inside.
|
||||
- Otherwise (IM channels, or thinking disabled globally): remove both
|
||||
the tags and the content entirely.
|
||||
"""
|
||||
if not text:
|
||||
return text
|
||||
import re
|
||||
if self._is_thinking_enabled():
|
||||
if self._should_render_thinking_inline():
|
||||
text = re.sub(r'<think>', '', text)
|
||||
text = re.sub(r'</think>', '', text)
|
||||
else:
|
||||
|
||||
@@ -167,13 +167,15 @@ class AgentLLMModel(LLMModel):
|
||||
if session_id:
|
||||
kwargs['session_id'] = session_id
|
||||
|
||||
# Determine thinking: respect global config, then channel_type
|
||||
# Thinking mode is a global toggle independent of the channel.
|
||||
# IM channels (WeChat/WeCom/DingTalk/Feishu) won't render the
|
||||
# reasoning trace, but still benefit from the higher answer
|
||||
# quality the thinking pass produces.
|
||||
from config import conf
|
||||
global_thinking = conf().get("enable_thinking", False)
|
||||
if not global_thinking:
|
||||
kwargs['thinking'] = {"type": "disabled"}
|
||||
else:
|
||||
kwargs['thinking'] = {"type": "enabled"} if channel_type == "web" else {"type": "disabled"}
|
||||
kwargs['thinking'] = (
|
||||
{"type": "enabled"} if conf().get("enable_thinking", False)
|
||||
else {"type": "disabled"}
|
||||
)
|
||||
|
||||
response = self.bot.call_with_tools(**kwargs)
|
||||
return self._format_response(response)
|
||||
@@ -220,13 +222,15 @@ class AgentLLMModel(LLMModel):
|
||||
if session_id:
|
||||
kwargs['session_id'] = session_id
|
||||
|
||||
# Determine thinking: respect global config, then channel_type
|
||||
# Thinking mode is a global toggle independent of the channel.
|
||||
# IM channels (WeChat/WeCom/DingTalk/Feishu) won't render the
|
||||
# reasoning trace, but still benefit from the higher answer
|
||||
# quality the thinking pass produces.
|
||||
from config import conf
|
||||
global_thinking = conf().get("enable_thinking", False)
|
||||
if not global_thinking:
|
||||
kwargs['thinking'] = {"type": "disabled"}
|
||||
else:
|
||||
kwargs['thinking'] = {"type": "enabled"} if channel_type == "web" else {"type": "disabled"}
|
||||
kwargs['thinking'] = (
|
||||
{"type": "enabled"} if conf().get("enable_thinking", False)
|
||||
else {"type": "disabled"}
|
||||
)
|
||||
|
||||
stream = self.bot.call_with_tools(**kwargs)
|
||||
|
||||
|
||||
@@ -31,5 +31,6 @@
|
||||
"agent_max_context_tokens": 50000,
|
||||
"agent_max_context_turns": 20,
|
||||
"agent_max_steps": 20,
|
||||
"enable_thinking": false,
|
||||
"knowledge": true
|
||||
}
|
||||
|
||||
@@ -204,7 +204,7 @@ available_setting = {
|
||||
"agent_max_context_tokens": 50000, # Agent模式下最大上下文tokens
|
||||
"agent_max_context_turns": 20, # Agent模式下最大上下文记忆轮次
|
||||
"agent_max_steps": 20, # Agent模式下单次运行最大决策步数
|
||||
"enable_thinking": False, # Whether to enable deep thinking for web channel
|
||||
"enable_thinking": False, # Toggle deep-thinking mode for thinking-capable models (e.g. deepseek-v4-pro/flash, deepseek-reasoner, kimi-k2-thinking). When enabled, the model produces a reasoning trace before the final answer; the Web console renders it in a collapsible panel, while IM channels (WeChat/WeCom/DingTalk/Feishu) still benefit from the improved answer quality but do not display the trace. Note: enabling thinking increases first-token latency.
|
||||
"knowledge": True, # 是否开启知识库功能
|
||||
# Per-skill runtime config. Nested keys are flattened to env vars at startup
|
||||
# using the rule: skill[<name>][<key>] -> SKILL_<NAME>_<KEY>
|
||||
|
||||
@@ -22,7 +22,7 @@ Web 控制台是 CowAgent 的默认通道,启动后会自动运行,通过浏
|
||||
| `web_port` | Web 服务监听端口 | `9899` |
|
||||
| `web_password` | 访问密码,留空表示不启用密码保护 | `""` |
|
||||
| `web_session_expire_days` | 登录会话有效天数 | `30` |
|
||||
| `enable_thinking` | 是否启用深度思考,开启后 Web 端展示推理过程,关闭可加速响应 | `false` |
|
||||
| `enable_thinking` | 全局深度思考开关(影响所有渠道)。开启后思考型模型(deepseek-v4-pro/flash、deepseek-reasoner、kimi-k2-thinking 等)会先推理再作答;Web 端会展示思考过程,回答质量更高但首字延迟增加 | `false` |
|
||||
|
||||
配置密码后,访问控制台时需先输入密码完成登录。登录状态默认保持 30 天,期间重启服务也无需重新登录。密码也支持在控制台的「配置」页面中在线修改。
|
||||
|
||||
|
||||
@@ -69,7 +69,7 @@ Session: 12 messages | 8 skills loaded
|
||||
| `agent_max_context_tokens` | 最大上下文 tokens | `40000` |
|
||||
| `agent_max_context_turns` | 最大上下文记忆轮次 | `30` |
|
||||
| `agent_max_steps` | 单次任务最大决策步数 | `15` |
|
||||
| `enable_thinking` | 是否启用深度思考 | `true` / `false` |
|
||||
| `enable_thinking` | 是否启用深度思考(全局开关,对所有渠道生效;Web 端会展示思考过程,IM 渠道不展示但同样受益于更高的回答质量) | `true` / `false` |
|
||||
|
||||
<Note>
|
||||
修改 `model` 时,系统会自动匹配对应的模型调用方式。配置会写入 `config.json` 并持久保存。
|
||||
|
||||
@@ -55,7 +55,7 @@ View or modify runtime configuration. Changes take effect immediately without re
|
||||
| `agent_max_context_tokens` | Max context tokens | `40000` |
|
||||
| `agent_max_context_turns` | Max context memory turns | `30` |
|
||||
| `agent_max_steps` | Max decision steps per task | `15` |
|
||||
| `enable_thinking` | Enable deep thinking | `true` / `false` |
|
||||
| `enable_thinking` | Enable deep thinking (global toggle, applies to all channels; Web console renders the reasoning trace, IM channels don't display it but still benefit from improved answer quality) | `true` / `false` |
|
||||
|
||||
<Note>
|
||||
When changing `model`, the system automatically matches the corresponding model API. Configuration is persisted to `config.json`.
|
||||
|
||||
@@ -81,5 +81,5 @@ Agent 的工作空间默认位于 `~/cow` 目录,用于存储系统提示词
|
||||
| `agent_max_context_tokens` | 最大上下文 token 数 | `50000` |
|
||||
| `agent_max_context_turns` | 最大上下文记忆轮次 | `20` |
|
||||
| `agent_max_steps` | 单次任务最大决策步数 | `20` |
|
||||
| `enable_thinking` | 是否启用深度思考,开启后 Web 端展示推理过程,关闭可加速响应 | `false` |
|
||||
| `enable_thinking` | 是否启用深度思考模式(适用于 deepseek-v4-pro/flash、deepseek-reasoner、kimi-k2-thinking 等思考型模型)。开启后所有渠道下模型都会先思考再回答,回答质量更高但首字延迟增加;Web 端会展示思考过程,IM 渠道(微信/企微/钉钉/飞书)虽不展示但同样获得更好答案 | `false` |
|
||||
| `knowledge` | 是否启用个人知识库 | `true` |
|
||||
|
||||
@@ -55,7 +55,7 @@ description: ステータスの確認、設定管理、コンテキスト制御
|
||||
| `agent_max_context_tokens` | 最大コンテキストトークン数 | `40000` |
|
||||
| `agent_max_context_turns` | 最大コンテキスト記憶ターン数 | `30` |
|
||||
| `agent_max_steps` | タスクごとの最大判断ステップ数 | `15` |
|
||||
| `enable_thinking` | ディープシンキングの有効化 | `true` / `false` |
|
||||
| `enable_thinking` | ディープシンキングの有効化(全チャネル共通のグローバルトグル。Web コンソールでは思考過程を折りたたみ表示、IM チャネルでは表示されないものの回答品質の向上は享受可能) | `true` / `false` |
|
||||
|
||||
<Note>
|
||||
`model` を変更すると、システムが対応するモデル API を自動的にマッチングします。設定は `config.json` に永続的に保存されます。
|
||||
|
||||
@@ -686,7 +686,51 @@ def _handle_linkai_stream_response(self, base_url, headers, body):
|
||||
"status_code": 500
|
||||
}
|
||||
|
||||
def _linkai_convert_messages_to_openai_format(self, messages):
    """
    Convert Claude-style messages to OpenAI format, keeping the reasoning trace.

    Overrides the base OpenAI-compatible conversion to round-trip
    ``reasoning_content`` on assistant messages.

    Internally, the agent layer keeps the model's reasoning trace as a
    Claude-style ``thinking`` content block on the assistant message. The
    base converter drops that block. For thinking-capable models proxied via
    LinkAI (DeepSeek V4, Kimi K2 thinking, ...), the upstream API requires
    the trace to be echoed back as a top-level ``reasoning_content`` field
    on every assistant turn that contained tool calls; otherwise the next
    request returns 400. It is re-emitted for every assistant turn that has
    a ``thinking`` block (it is silently ignored on plain text turns).

    Args:
        messages: Claude-style message dicts (``role`` / ``content``), where
            ``content`` may be a list of content blocks.

    Returns:
        The list produced by the base converter, with ``reasoning_content``
        added to each assistant entry whose source message carried one or
        more ``thinking`` blocks (multiple blocks are joined with newlines).
    """
    openai_messages = OpenAICompatibleBot._convert_messages_to_openai_format(self, messages)
    if not messages:
        return openai_messages

    # Walk the original Claude messages to collect each assistant turn's
    # reasoning text, then attach it to the matching converted entry.
    # NOTE(review): the pairing is positional - it assumes the base converter
    # emits one assistant entry per source assistant message, in order.
    # Confirm against OpenAICompatibleBot.
    dst_idx = 0
    for src in messages:
        if src.get("role") != "assistant":
            continue
        content = src.get("content")
        reasoning_parts = []
        if isinstance(content, list):
            # ``or ""`` covers a thinking block whose "thinking" value is
            # present but None; str.join would raise TypeError on it.
            reasoning_parts = [
                b.get("thinking") or "" for b in content
                if isinstance(b, dict) and b.get("type") == "thinking"
            ]
        # Locate the corresponding assistant entry in the converted list.
        while dst_idx < len(openai_messages) and openai_messages[dst_idx].get("role") != "assistant":
            dst_idx += 1
        if dst_idx >= len(openai_messages):
            break
        if reasoning_parts:
            openai_messages[dst_idx]["reasoning_content"] = "\n".join(reasoning_parts)
        # Move past the entry just handled so the next source assistant
        # message pairs with the next converted assistant entry.
        dst_idx += 1

    return openai_messages
|
||||
|
||||
|
||||
# Attach methods to LinkAIBot class
|
||||
LinkAIBot.call_with_tools = _linkai_call_with_tools
|
||||
LinkAIBot._handle_linkai_sync_response = _handle_linkai_sync_response
|
||||
LinkAIBot._handle_linkai_stream_response = _handle_linkai_stream_response
|
||||
LinkAIBot._convert_messages_to_openai_format = _linkai_convert_messages_to_openai_format
|
||||
|
||||
@@ -45,7 +45,7 @@ python <base_dir>/scripts/generate.py '<json_args>'
|
||||
| `size` | string | no | auto | `512` / `1K` / `2K` / `3K` / `4K`, or pixel value (`1024x1024`) |
|
||||
| `aspect_ratio` | string | no | null | `1:1` / `3:2` / `2:3` / `16:9` / `9:16` / `21:9` (some backends also support extreme ratios like `1:4` / `8:1`) |
|
||||
|
||||
**Higher `quality` and larger `size` cost more and run slower.** Default to omitting both (`auto`) so the model picks a balanced setting. Only raise them when the user explicitly asks for high quality / a poster / print-ready output. For quick previews or chat scenarios prefer `quality=low` + `size=1K`.
|
||||
**Higher `quality` and larger `size` cost more and run slower.** When the user does not specify a quality, `low` or `medium` is usually sufficient; use `high` only when the user explicitly asks for it.
|
||||
|
||||
### Example — generate
|
||||
|
||||
|
||||
Reference in New Issue
Block a user