feat(voice): rework TTS/ASR stack and unify tool/skill config schema

2026-07-19 12:47:25 +08:00 · 2026-05-21 16:00:54 +08:00
parent 2b90f377e6
commit b8333e351c
31 changed files with 1551 additions and 335 deletions
--- a/agent/memory/conversation_store.py
+++ b/agent/memory/conversation_store.py
@@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS messages (
    role         TEXT    NOT NULL,
    content      TEXT    NOT NULL,
    created_at   INTEGER NOT NULL,
    extras       TEXT    NOT NULL DEFAULT '',
    UNIQUE (session_id, seq)
 );
@@ -67,6 +68,12 @@ _MIGRATION_ADD_CONTEXT_START_SEQ = """
 ALTER TABLE sessions ADD COLUMN context_start_seq INTEGER NOT NULL DEFAULT 0;
 """
 # Generic JSON sidecar for per-message attachments (TTS audio URL, future use).
 # Always optional — readers must tolerate missing column / empty / invalid JSON.
 _MIGRATION_ADD_MSG_EXTRAS = """
 ALTER TABLE messages ADD COLUMN extras TEXT NOT NULL DEFAULT '';
 """
 DEFAULT_MAX_AGE_DAYS: int = 30
@@ -169,20 +176,26 @@ def _group_into_display_turns(
    cur_rest: List[tuple] = []
    started = False
-    for role, raw_content, created_at in rows:
+    for role, raw_content, created_at, raw_extras in rows:
        try:
            content = json.loads(raw_content)
        except Exception:
            content = raw_content
        try:
            extras = json.loads(raw_extras) if raw_extras else {}
            if not isinstance(extras, dict):
                extras = {}
        except Exception:
            extras = {}
        if role == "user" and _is_visible_user_message(content):
            if started:
                groups.append((cur_user, cur_rest))
-            cur_user = (content, created_at)
+            cur_user = (content, created_at, extras)
            cur_rest = []
            started = True
        else:
-            cur_rest.append((role, content, created_at))
+            cur_rest.append((role, content, created_at, extras))
    if started:
        groups.append((cur_user, cur_rest))
@@ -195,7 +208,7 @@ def _group_into_display_turns(
    for user_row, rest in groups:
        # User turn
        if user_row:
-            content, created_at = user_row
+            content, created_at, _u_extras = user_row
            text = _extract_display_text(content)
            if text:
                turns.append({"role": "user", "content": text, "created_at": created_at})
@@ -206,8 +219,11 @@ def _group_into_display_turns(
        tool_results: Dict[str, str] = {}
        final_text = ""
        final_ts: Optional[int] = None
        merged_extras: Dict[str, Any] = {}
-        for role, content, created_at in rest:
+        for role, content, created_at, extras in rest:
            if role == "assistant" and isinstance(extras, dict):
                merged_extras.update(extras)
            if role == "user":
                tool_results.update(_extract_tool_results(content))
            elif role == "assistant":
@@ -256,6 +272,8 @@ def _group_into_display_turns(
                "steps": steps,
                "created_at": final_ts or (user_row[1] if user_row else 0),
            }
            if merged_extras:
                turn["extras"] = merged_extras
            turns.append(turn)
    return turns
@@ -411,13 +429,15 @@ class ConversationStore:
                        content = json.dumps(
                            msg.get("content", ""), ensure_ascii=False
                        )
                        extras_obj = msg.get("extras") or {}
                        extras = json.dumps(extras_obj, ensure_ascii=False) if extras_obj else ""
                        conn.execute(
                            """
                            INSERT OR IGNORE INTO messages
-                                (session_id, seq, role, content, created_at)
+                                (session_id, seq, role, content, created_at, extras)
-                            VALUES (?, ?, ?, ?, ?)
+                            VALUES (?, ?, ?, ?, ?, ?)
                            """,
-                            (session_id, next_seq, role, content, now),
+                            (session_id, next_seq, role, content, now, extras),
                        )
                        next_seq += 1
@@ -651,6 +671,55 @@ class ConversationStore:
            logger.info(f"[ConversationStore] Pruned {deleted} expired sessions")
        return deleted
    def attach_extras_to_last_assistant(
        self,
        session_id: str,
        extras: Dict[str, Any],
    ) -> Optional[int]:
        """
        Fold ``extras`` into the newest assistant message stored for a session.

        Intended for post-processing steps (e.g. TTS) that annotate a bot
        reply after it has already been persisted, for instance with an
        audio URL.

        Args:
            session_id: Session whose latest assistant row is updated.
            extras: Keys to merge into the row's JSON ``extras`` object;
                keys already present are overwritten.

        Returns:
            The ``seq`` of the updated row, or ``None`` when ``extras`` is
            empty, the session has no assistant message, or the update
            failed (failures are logged as a warning, not raised).
        """
        if not extras:
            return None

        with self._lock:
            db = self._connect()
            try:
                latest = db.execute(
                    """
                    SELECT seq, extras FROM messages
                    WHERE session_id = ? AND role = 'assistant'
                    ORDER BY seq DESC LIMIT 1
                    """,
                    (session_id,),
                ).fetchone()
                if latest:
                    target_seq, stored = latest

                    # Start from the stored object when it decodes to a dict;
                    # empty, malformed or non-dict JSON is replaced wholesale.
                    merged: Dict[str, Any] = {}
                    if stored:
                        try:
                            decoded = json.loads(stored)
                        except Exception:
                            decoded = None
                        if isinstance(decoded, dict):
                            merged = decoded
                    merged.update(extras)

                    db.execute(
                        "UPDATE messages SET extras = ? WHERE session_id = ? AND seq = ?",
                        (json.dumps(merged, ensure_ascii=False), session_id, target_seq),
                    )
                    db.commit()
                    return target_seq
                return None
            except Exception as e:
                # Best-effort annotation: report the problem and signal "not applied".
                logger.warning(f"[ConversationStore] attach_extras failed: {e}")
                return None
            finally:
                db.close()
    def load_history_page(
        self,
        session_id: str,
@@ -698,7 +767,22 @@ class ConversationStore:
                ).fetchone()
                ctx_start = ctx_row[0] if ctx_row else 0
                # extras column is added by migration; tolerate older DBs that
                # might miss it by substituting an empty-string extras value.
                try:
                    rows = conn.execute(
                        """
                        SELECT seq, role, content, created_at, extras
                        FROM messages
                        WHERE session_id = ?
                        ORDER BY seq ASC
                        """,
                        (session_id,),
                    ).fetchall()
                except sqlite3.OperationalError:
                    rows = [
                        (seq, role, content, created_at, "")
                        for (seq, role, content, created_at) in conn.execute(
                            """
                            SELECT seq, role, content, created_at
                            FROM messages
@@ -707,6 +791,7 @@ class ConversationStore:
                            """,
                            (session_id,),
                        ).fetchall()
                    ]
            finally:
                conn.close()
@@ -719,13 +804,16 @@ class ConversationStore:
            include_thinking = False
        # Strip seq for display grouping, but record max seq per visible user group
-        plain_rows = [(role, content, created_at) for _seq, role, content, created_at in rows]
+        plain_rows = [
            (role, content, created_at, extras_raw)
            for _seq, role, content, created_at, extras_raw in rows
        ]
        visible = _group_into_display_turns(plain_rows, include_thinking=include_thinking)
        # Build a mapping: find the seq of each visible user message to annotate context boundary.
        # Walk through rows to find visible user message seqs in order.
        visible_user_seqs: List[int] = []
-        for seq, role, raw_content, _ts in rows:
+        for seq, role, raw_content, _ts, _extras in rows:
            if role != "user":
                continue
            try:
@@ -911,6 +999,18 @@ class ConversationStore:
            except Exception as e:
                logger.warning(f"[ConversationStore] Migration (context_start_seq) failed: {e}")
        msg_cols = {
            row[1]
            for row in conn.execute("PRAGMA table_info(messages)").fetchall()
        }
        if "extras" not in msg_cols:
            try:
                conn.execute(_MIGRATION_ADD_MSG_EXTRAS)
                conn.commit()
                logger.info("[ConversationStore] Migrated: added messages.extras column")
            except Exception as e:
                logger.warning(f"[ConversationStore] Migration (extras) failed: {e}")
    def _connect(self) -> sqlite3.Connection:
        conn = sqlite3.connect(str(self._db_path), timeout=10)
        conn.execute("PRAGMA journal_mode=WAL")
--- a/agent/tools/vision/vision.py
+++ b/agent/tools/vision/vision.py
@@ -3,7 +3,7 @@ Vision tool - Analyze images using Vision API.
 Supports local files (auto base64-encoded) and HTTP URLs.
 Provider resolution:
-  - tool.vision.model (if set) means "prefer this model first; fall back to
+  - tools.vision.model (if set) means "prefer this model first; fall back to
    other configured providers if it fails". The model name is mapped to its
    native provider (e.g. doubao-* → Doubao, kimi-* → Moonshot, gpt-* →
    OpenAI/LinkAI). That provider is tried first, then the standard auto
@@ -60,7 +60,7 @@ _DISCOVERABLE_MODELS = [
 ]
 # Model name prefix → discoverable provider display_name.
-# Used to auto-route tool.vision.model to its native provider.
+# Used to auto-route tools.vision.model to its native provider.
 # Matched case-insensitively; longest prefix wins.
 _MODEL_PREFIX_TO_PROVIDER = [
    ("doubao-", "Doubao"),
@@ -154,7 +154,7 @@ class Vision(BaseTool):
        # Default model is only used as a last-resort placeholder for providers
        # whose VisionProvider.model_override is None (e.g. raw OpenAI provider
-        # when the user did not configure tool.vision.model).
+        # when the user did not configure tools.vision.model).
        return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
    def _call_with_fallback(self, providers: List[VisionProvider], model: str,
@@ -193,12 +193,12 @@ class Vision(BaseTool):
        """
        Build an ordered list of providers to try.
-        Semantics of `tool.vision.model`:
+        Semantics of `tools.vision.model`:
          "Prefer this model first; fall back to other configured providers
           if it fails."
        Order:
-          1. The provider that natively serves `tool.vision.model` (if any
+          1. The provider that natively serves `tools.vision.model` (if any
             and its API key is configured) — using the user-specified model
             name verbatim.
          2. Auto-discovery chain as fallback:
@@ -213,7 +213,7 @@ class Vision(BaseTool):
        user_model = self._resolve_user_vision_model()
        providers: List[VisionProvider] = []
-        # Step 1: preferred provider derived from tool.vision.model
+        # Step 1: preferred provider derived from tools.vision.model
        if user_model:
            preferred = self._route_by_model_name(user_model)
            if preferred:
@@ -251,11 +251,11 @@ class Vision(BaseTool):
    @staticmethod
    def _resolve_user_vision_model() -> Optional[str]:
-        """Read tool.vision.model from config; return None if unset/blank."""
+        """Read tools.vision.model (singular ``tool`` kept as runtime fallback)."""
-        tool_conf = conf().get("tool", {})
+        tools_conf = conf().get("tools") or conf().get("tool") or {}
-        if not isinstance(tool_conf, dict):
+        if not isinstance(tools_conf, dict):
            return None
-        vision_conf = tool_conf.get("vision", {})
+        vision_conf = tools_conf.get("vision", {})
        if not isinstance(vision_conf, dict):
            return None
        m = vision_conf.get("model")
@@ -303,7 +303,7 @@ class Vision(BaseTool):
                self._append_provider(providers, lambda: self._build_linkai_provider(user_model))
            if providers:
                return providers
-            logger.warning(f"[Vision] tool.vision.model='{user_model}' looks like an OpenAI "
+            logger.warning(f"[Vision] tools.vision.model='{user_model}' looks like an OpenAI "
                           f"model but neither OPENAI_API_KEY nor LINKAI_API_KEY is configured.")
            return None  # fall through to auto
@@ -317,7 +317,7 @@ class Vision(BaseTool):
                continue
            api_key = conf().get(config_key, "")
            if not api_key or not api_key.strip():
-                logger.warning(f"[Vision] tool.vision.model='{user_model}' routes to "
+                logger.warning(f"[Vision] tools.vision.model='{user_model}' routes to "
                               f"'{display_name}' but '{config_key}' is not configured. "
                               f"Falling back to auto-discovery.")
                return None  # fall through to auto
@@ -452,8 +452,8 @@ class Vision(BaseTool):
        if not self._main_bot_supports_vision(bot):
            return None
-        # Use the configured main model name; do NOT inject tool.vision.model
+        # Use the configured main model name; do NOT inject tools.vision.model
-        # here, because by the time we reach this branch the tool.vision.model
+        # here, because by the time we reach this branch the tools.vision.model
        # routing has already been attempted (and either matched the main bot
        # or failed to find a provider).
        main_model_name = conf().get("model") or None
--- a/channel/chat_channel.py
+++ b/channel/chat_channel.py
@@ -171,7 +171,13 @@ class ChatChannel(Channel):
            if "desire_rtype" not in context and conf().get("always_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
                context["desire_rtype"] = ReplyType.VOICE
        elif context.type == ContextType.VOICE:
-            if "desire_rtype" not in context and conf().get("voice_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
+            # Voice input replies with voice when either voice_reply_voice
            # (mirror voice) or the global always_reply_voice toggle is on.
            if (
                "desire_rtype" not in context
                and (conf().get("voice_reply_voice") or conf().get("always_reply_voice"))
                and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE
            ):
                context["desire_rtype"] = ReplyType.VOICE
        return context
--- a/channel/feishu/feishu_channel.py
+++ b/channel/feishu/feishu_channel.py
@@ -1515,10 +1515,16 @@ class FeiShuChanel(ChatChannel):
            else:
                context.type = ContextType.TEXT
            context.content = content.strip()
            # Text input opts into voice replies only when the always-on toggle is set.
            if "desire_rtype" not in context and conf().get("always_reply_voice"):
                context["desire_rtype"] = ReplyType.VOICE
        elif context.type == ContextType.VOICE:
-            # 2.语音请求
+            # 2.语音请求: voice input replies with voice if either
-            if "desire_rtype" not in context and conf().get("voice_reply_voice"):
+            # voice_reply_voice (mirror reply) or always_reply_voice is on.
            if "desire_rtype" not in context and (
                conf().get("voice_reply_voice") or conf().get("always_reply_voice")
            ):
                context["desire_rtype"] = ReplyType.VOICE
        return context
--- a/channel/web/static/css/console.css
+++ b/channel/web/static/css/console.css
@@ -1294,3 +1294,76 @@
    overflow: hidden;
    min-height: 2.5em;  /* ~2 lines at text-sm leading-relaxed */
 }
 /* --------------------------------------------------------------------
 * Voice pill — compact custom audio player used by mic uploads and TTS
 * replies. Replaces the bulky native <audio controls> with a play/pause
 * icon + thin progress bar + duration counter so it blends into chat
 * bubbles without the chrome-grey browser default look.
 * ------------------------------------------------------------------ */
 /* Root container: a horizontal inline-flex row with fully rounded ends,
  * capped at 240px wide. Text selection is disabled on the pill. */
 .voice-pill {
    display: inline-flex;
    align-items: center;
    gap: 8px;
    padding: 6px 10px;
    border-radius: 999px;
    background: rgba(15, 23, 42, 0.05);
    color: rgb(71, 85, 105);
    font-size: 12px;
    line-height: 1;
    max-width: 240px;
    user-select: none;
    cursor: default;
 }
 /* Colour overrides applied when the pill sits under a `.dark` ancestor. */
 .dark .voice-pill {
    background: rgba(255, 255, 255, 0.08);
    color: rgb(203, 213, 225);
 }
 /* Dimmed look while data-loading="1" is present on the pill.
  * NOTE(review): no code in the visible hunks sets this attribute — confirm. */
 .voice-pill[data-loading="1"] {
    opacity: 0.65;
 }
 /* Circular 22px play/pause button; uses --color-primary-500 when defined,
  * otherwise the #2563eb fallback. flex-shrink: 0 keeps it from collapsing. */
 .voice-pill-btn {
    width: 22px;
    height: 22px;
    border-radius: 999px;
    display: inline-flex;
    align-items: center;
    justify-content: center;
    background: var(--color-primary-500, #2563eb);
    color: #fff;
    flex-shrink: 0;
    cursor: pointer;
    transition: transform 0.1s ease;
 }
 .voice-pill-btn:hover { transform: scale(1.05); }
 /* Per-state horizontal nudge of the icon glyph (keyed on data-state);
  * presumably optical centring of the play vs. pause glyphs. */
 .voice-pill-btn i { font-size: 9px; margin-left: 1px; }
 .voice-pill-btn[data-state="play"] i { margin-left: 2px; }
 .voice-pill-btn[data-state="pause"] i { margin-left: 0; }
 /* Progress track: takes the remaining row width (flex: 1, at least 70px)
  * and clips the fill to its rounded outline via overflow: hidden. */
 .voice-pill-track {
    flex: 1;
    height: 3px;
    border-radius: 999px;
    background: rgba(100, 116, 139, 0.25);
    overflow: hidden;
    min-width: 70px;
 }
 .dark .voice-pill-track {
    background: rgba(148, 163, 184, 0.25);
 }
 /* Progress fill: starts empty (width: 0%); width changes are eased over
  * 0.1s so updates made from script animate instead of jumping. */
 .voice-pill-fill {
    height: 100%;
    width: 0%;
    background: var(--color-primary-500, #2563eb);
    border-radius: inherit;
    transition: width 0.1s linear;
 }
 /* Time counter: tabular-nums gives every digit the same advance width so
  * the label does not shift as the numbers change. */
 .voice-pill-time {
    font-variant-numeric: tabular-nums;
    font-size: 11px;
    color: inherit;
    opacity: 0.75;
    flex-shrink: 0;
    min-width: 28px;
    text-align: right;
 }
 /* The <audio> element inside the pill is never shown; the pill itself is
  * the visible player UI. */
 .voice-pill audio { display: none; }
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -25,6 +25,7 @@ const I18N = {
        models_add_vendor: '添加厂商',
        models_provider: '厂商',
        models_model: '模型',
        models_voice: '音色',
        models_configured: '已配置',
        models_not_configured: '未配置',
        models_pick_to_configure: '选择以配置',
@@ -160,6 +161,11 @@ const I18N = {
        mic_permission_denied: '无法访问麦克风，请检查浏览器权限',
        mic_too_short: '录音太短，请重试',
        mic_error: '语音识别失败',
        speak_msg: '朗读这段回复',
        voice_reply_mode_label: '语音回复策略',
        voice_reply_off: '关闭',
        voice_reply_if_voice: '仅语音问/语音答',
        voice_reply_always: '总是语音回复',
        attach_menu_folder: '上传文件夹',
        confirm_yes: '确认',
        confirm_cancel: '取消',
@@ -180,6 +186,7 @@ const I18N = {
        models_add_vendor: 'Add Vendor',
        models_provider: 'Provider',
        models_model: 'Model',
        models_voice: 'Voice',
        models_configured: 'configured',
        models_not_configured: 'not configured',
        models_pick_to_configure: 'pick to configure',
@@ -315,6 +322,11 @@ const I18N = {
        mic_permission_denied: 'Cannot access microphone — check browser permissions',
        mic_too_short: 'Recording too short, please retry',
        mic_error: 'Speech recognition failed',
        speak_msg: 'Read this reply aloud',
        voice_reply_mode_label: 'Voice reply policy',
        voice_reply_off: 'Off',
        voice_reply_if_voice: 'Voice only if voice input',
        voice_reply_always: 'Always reply with voice',
        attach_menu_folder: 'Upload Folder',
        confirm_yes: 'Confirm',
        confirm_cancel: 'Cancel',
@@ -1474,6 +1486,7 @@ function sendVoiceMessage(text, audioUrl) {
        message: text,
        stream: true,
        timestamp: timestamp.toISOString(),
        is_voice: true,
    };
    const MAX_RETRIES = 2;
@@ -1512,19 +1525,19 @@ function sendVoiceMessage(text, audioUrl) {
 function addUserVoiceMessage(audioUrl, caption, timestamp) {
    const el = document.createElement('div');
    el.className = 'flex justify-end px-4 sm:px-6 py-3';
-    // Voice-message bubble: playable <audio> on top, ASR caption beneath.
+    // Voice-message bubble: compact voice pill on top, ASR caption beneath.
    // The bubble keeps the same primary tint as a normal user message so
    // it visually slots into the conversation flow.
    el.innerHTML = `
        <div class="max-w-[75%] sm:max-w-[60%]">
            <div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
-                <audio controls preload="metadata" src="${audioUrl}"
+                <div class="user-voice-slot"></div>
                       class="block w-[260px] max-w-full h-9"></audio>
                ${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
            </div>
            <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
        </div>
    `;
    el.querySelector('.user-voice-slot').appendChild(renderVoicePill(audioUrl));
    messagesDiv.appendChild(el);
    _autoScrollEnabled = true;
    scrollChatToBottom(true);
@@ -1639,12 +1652,16 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                    <div class="agent-steps"></div>
                    <div class="answer-content sse-streaming"></div>
                    <div class="media-content"></div>
                    <div class="bot-audio-slot"></div>
                </div>
                <div class="flex items-center gap-2 mt-1.5">
                    <span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
                    <button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}" style="display:none">
                        <i class="fas fa-copy"></i>
                    </button>
                    <button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
                        <i class="fas fa-volume-up"></i>
                    </button>
                </div>
            </div>
        `;
@@ -1856,11 +1873,12 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                scrollChatToBottom();
            } else if (item.type === 'done') {
                // Don't close the stream yet: the backend keeps it open
                // for a short tail to deliver async attachments such as
                // TTS audio (`voice_attach`). It will close the stream on
                // its own via onerror once the tail expires.
                done = true;
                es.close();
                delete activeStreams[requestId];
                // item.content may be empty when "done" is only a stream-close signal after media.
                const finalText = item.content || accumulatedText;
                if (!botEl && finalText) {
@@ -1874,6 +1892,7 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                    if (copyBtn && finalText) copyBtn.style.display = '';
                    applyHighlighting(botEl);
                }
                renderBotSpeakerButton(botEl, finalText);
                scrollChatToBottom();
                if (titleInfo) {
@@ -1883,6 +1902,15 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                    loadSessionList();
                }
            } else if (item.type === 'voice_attach') {
                // TTS finished — attach a playable audio element to the
                // current bot bubble. The stream closes right after.
                if (botEl && item.url) {
                    attachAudioToBotBubble(botEl, item.url, { autoplay: true });
                }
                es.close();
                delete activeStreams[requestId];
            } else if (item.type === 'error') {
                done = true;
                es.close();
@@ -1896,7 +1924,10 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
            es.close();
            delete activeStreams[requestId];
-            if (done) return;
+            if (done) {
                // Normal close after the post-done tail expired; nothing to do.
                return;
            }
            if (currentReasoningEl) {
                finalizeThinking(currentReasoningEl, reasoningStartTime, reasoningText);
@@ -2187,21 +2218,174 @@ function createBotMessageEl(content, timestamp, requestId, msg) {
            <div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200">
                ${stepsHtml ? `<div class="agent-steps">${stepsHtml}</div>` : ''}
                <div class="answer-content">${renderMarkdown(displayContent)}</div>
                <div class="bot-audio-slot"></div>
            </div>
            <div class="flex items-center gap-2 mt-1.5">
                <span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
                <button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}">
                    <i class="fas fa-copy"></i>
                </button>
                <button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
                    <i class="fas fa-volume-up"></i>
                </button>
            </div>
        </div>
    `;
    el.querySelector('.answer-content').dataset.rawMd = displayContent;
    // Existing TTS attachment (history replay): mount the player up-front.
    const existingAudio = msg && msg.extras && msg.extras.audio && msg.extras.audio.url;
    if (existingAudio) {
        attachAudioToBotBubble(el, existingAudio, { autoplay: false });
    }
    renderBotSpeakerButton(el, displayContent);
    applyHighlighting(el);
    bindChatKnowledgeLinks(el);
    return el;
 }
 // Mount a voice pill for `audioUrl` inside the bubble's `.bot-audio-slot`,
 // replacing whatever the slot held before, and hide the manual
 // "read aloud" button. Shared by live TTS pushes and history replay.
 // Any error is swallowed: this function never throws.
 function attachAudioToBotBubble(botEl, audioUrl, opts) {
    try {
        const audioSlot = (botEl && audioUrl)
            ? botEl.querySelector('.bot-audio-slot')
            : null;
        if (audioSlot) {
            // Reset the slot so a repeated attach replaces the old player.
            audioSlot.innerHTML = '';
            audioSlot.style.marginTop = '6px';
            const wantsAutoplay = !!(opts && opts.autoplay);
            audioSlot.appendChild(renderVoicePill(audioUrl, { autoplay: wantsAutoplay }));

            // The bubble now has audio, so the manual trigger is redundant.
            const manualBtn = botEl.querySelector('.speak-msg-btn');
            if (manualBtn) {
                manualBtn.style.display = 'none';
            }
        }
    } catch (_) {
        /* silent */
    }
 }
 // Build a compact play/pause + progress + duration pill that wraps a
 // hidden <audio>. Returns the root element; safe to embed anywhere.
 //
 //   audioUrl      - URL of the clip. Assigned through the DOM `src`
 //                   property, never spliced into markup, so quotes or
 //                   angle brackets in the URL cannot alter the HTML.
 //   opts.autoplay - when truthy, try to start playback once the clip
 //                   can play; a rejected play() is ignored.
 function renderVoicePill(audioUrl, opts) {
    opts = opts || {};
    const wrap = document.createElement('div');
    wrap.className = 'voice-pill';
    // Static markup only: nothing caller-supplied is interpolated here.
    wrap.innerHTML = `
        <button type="button" class="voice-pill-btn" data-state="play" aria-label="play">
            <i class="fas fa-play"></i>
        </button>
        <div class="voice-pill-track"><div class="voice-pill-fill"></div></div>
        <span class="voice-pill-time">0:00</span>
        <audio preload="metadata"></audio>
    `;
    const btn = wrap.querySelector('.voice-pill-btn');
    const fill = wrap.querySelector('.voice-pill-fill');
    const timeEl = wrap.querySelector('.voice-pill-time');
    const audio = wrap.querySelector('audio');
    // Property assignment treats the value purely as a URL string.
    audio.src = audioUrl;

    // Format seconds as m:ss; non-finite or negative input renders as 0:00.
    const fmt = (s) => {
        if (!isFinite(s) || s < 0) s = 0;
        const m = Math.floor(s / 60);
        const r = Math.floor(s % 60);
        return `${m}:${r < 10 ? '0' : ''}${r}`;
    };
    // Keep the button's data-state, icon class and aria-label in sync.
    const setIcon = (state) => {
        btn.dataset.state = state;
        btn.querySelector('i').className = state === 'pause' ? 'fas fa-pause' : 'fas fa-play';
        btn.setAttribute('aria-label', state === 'pause' ? 'pause' : 'play');
    };

    // Idle state shows the total duration once metadata is known.
    audio.addEventListener('loadedmetadata', () => {
        if (audio.duration && isFinite(audio.duration)) timeEl.textContent = fmt(audio.duration);
    });
    // While playing, the bar fills up and the counter shows time remaining.
    audio.addEventListener('timeupdate', () => {
        const dur = audio.duration || 0;
        if (dur > 0) {
            fill.style.width = `${Math.min(100, (audio.currentTime / dur) * 100)}%`;
            timeEl.textContent = fmt(dur - audio.currentTime);
        }
    });
    // At the end, reset to the idle look (empty bar, total duration).
    audio.addEventListener('ended', () => {
        setIcon('play');
        fill.style.width = '0%';
        timeEl.textContent = fmt(audio.duration || 0);
    });
    // The icon follows the element's real state, whatever triggered it.
    audio.addEventListener('play',  () => setIcon('pause'));
    audio.addEventListener('pause', () => setIcon('play'));

    btn.addEventListener('click', (e) => {
        // Keep the click from reaching handlers on the surrounding bubble.
        e.stopPropagation();
        if (audio.paused) {
            audio.play().catch(() => {});
        } else {
            audio.pause();
        }
    });

    if (opts.autoplay) {
        // Autoplay may be blocked by the browser; fall back silently and
        // let the user tap the play button.
        const tryPlay = () => audio.play().catch(() => {});
        if (audio.readyState >= 2) tryPlay();
        else audio.addEventListener('canplay', tryPlay, { once: true });
    }
    return wrap;
 }
 // Show the manual "read aloud" button when TTS is configured but the
 // bubble has no audio yet. Lazily probes capability via /api/models so
 // we don't expose the button when nothing can synthesize speech.
 //
 //   botEl - bot bubble root; must contain `.speak-msg-btn` to have any effect.
 //   text  - reply text to synthesize; blank text leaves the button hidden.
 function renderBotSpeakerButton(botEl, text) {
    if (!botEl || !text || !text.trim()) return;
    const btn = botEl.querySelector('.speak-msg-btn');
    if (!btn) return;
    const hasAudio = () => Boolean(botEl.querySelector('.bot-audio-slot audio'));
    if (hasAudio()) return;
    _isTtsReady().then(ready => {
        // Re-check after the async probe: audio may have been attached to
        // this bubble (which hides the button) while the probe was in
        // flight, and the button must not be shown again in that case.
        if (!ready || hasAudio()) return;
        btn.style.display = '';
        btn.onclick = () => _triggerManualTts(btn, botEl, text);
    });
 }
 // Memoized capability probe: the in-flight/settled promise and the time
 // (ms since epoch) at which it was started.
 let _ttsReadyPromise = null;
 let _ttsReadyTs = 0;
 // Resolve to true when /api/models reports a current or suggested TTS
 // provider, false otherwise (including on any fetch/parse failure).
 function _isTtsReady() {
    // Reuse the last probe for 30s to avoid hammering /api/models on
    // every bubble.
    const cacheIsFresh = Boolean(_ttsReadyPromise) && (Date.now() - _ttsReadyTs) < 30000;
    if (!cacheIsFresh) {
        _ttsReadyTs = Date.now();
        _ttsReadyPromise = fetch('/api/models')
            .then((resp) => resp.json())
            .then((payload) => {
                const ttsCap = payload && payload.capabilities && payload.capabilities.tts;
                return ttsCap
                    ? Boolean(ttsCap.current_provider || ttsCap.suggested_provider)
                    : false;
            })
            .catch(() => false);
    }
    return _ttsReadyPromise;
 }
 // Request speech for `text` from /api/voice/tts and, on success, mount
 // the returned audio in the bubble with autoplay. The button is marked
 // busy and shows a spinner for the duration; repeated clicks while busy
 // are ignored. Failures are silent.
 function _triggerManualTts(btn, botEl, text) {
    if (btn.dataset.busy === '1') {
        return;
    }
    btn.dataset.busy = '1';

    // Swap the icon for a spinner, remembering what to restore afterwards.
    const iconEl = btn.querySelector('i');
    let savedIconClass = '';
    if (iconEl) {
        savedIconClass = iconEl.className;
        iconEl.className = 'fas fa-spinner fa-spin';
    }

    const request = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: text, session_id: sessionId }),
    };
    const mountAudio = (result) => {
        const succeeded = result && result.status === 'success' && result.audio_url;
        if (succeeded) {
            attachAudioToBotBubble(botEl, result.audio_url, { autoplay: true });
        }
    };
    // Runs on success and failure alike so the button never stays stuck.
    const restoreButton = () => {
        btn.dataset.busy = '0';
        if (iconEl) {
            iconEl.className = savedIconClass || 'fas fa-volume-up';
        }
    };

    fetch('/api/voice/tts', request)
        .then((resp) => resp.json())
        .then(mountAudio)
        .catch(() => {})
        .finally(restoreButton);
 }
 function addUserMessage(content, timestamp, attachments) {
    const el = createUserMessageEl(content, timestamp, attachments);
    messagesDiv.appendChild(el);
@@ -3842,14 +4026,39 @@ function renderCapabilityBody(def, cap, body) {
    body.innerHTML = providerHtml + modelHtml + dimHtml + footer;
-    // The body subtree is detached from `document` at this moment (the parent
+    // TTS: mount reply-mode above provider; defer off-mode toggle to the end.
-    // wrap is not yet appended), so we must scope lookups to `body` rather
+    if (def.id === 'tts') {
-    // than calling document.getElementById, which would return null and crash
+        renderVoiceReplyMode(body, cap.reply_mode || 'off', { skipVisibilityToggle: true });
-    // initDropdown's internal querySelector.
+        // Voice-timbre picker depends on provider+model; rebuilt by callbacks.
        const modelWrap = body.querySelector(`#cap-${def.id}-model-wrap`);
        if (modelWrap) {
            const voiceWrap = document.createElement('div');
            voiceWrap.id = `cap-${def.id}-voice-wrap`;
            voiceWrap.innerHTML = `
                <label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('models_voice')}</label>
                <div id="cap-${def.id}-voice" class="cfg-dropdown" tabindex="0">
                    <div class="cfg-dropdown-selected">
                        <span class="cfg-dropdown-text">--</span>
                        <i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
                    </div>
                    <div class="cfg-dropdown-menu"></div>
                </div>
                <div id="cap-${def.id}-voice-custom-wrap" class="hidden mt-2">
                    <input id="cap-${def.id}-voice-custom" type="text"
                           class="w-full px-3 py-2 text-sm rounded-md border border-slate-200 dark:border-slate-700
                                  bg-white dark:bg-slate-800 text-slate-700 dark:text-slate-200
                                  placeholder:text-slate-400 dark:placeholder:text-slate-500
                                  focus:outline-none focus:ring-2 focus:ring-primary-500"
                           placeholder="voice id" />
                </div>
            `;
            modelWrap.parentNode.insertBefore(voiceWrap, modelWrap.nextSibling);
        }
    }
    // `body` is still detached from `document`; scope lookups locally.
    const provDd = body.querySelector(`#cap-${def.id}-provider`);
-    // initDropdown's option shape is {value, label}; we strip our private
+    // Strip private fields before handing to the generic initDropdown helper.
    // _configured/_tracked fields before handing it over so the helper stays
    // generic, then re-attach status decorations afterwards.
    const ddOpts = providerOpts.map(o => ({ value: o.value, label: o.label }));
    let pendingProvider = null;
@@ -3860,15 +4069,9 @@ function renderCapabilityBody(def, cap, body) {
        pendingCapabilitySelection = null;
    }
-    // For auto-capable capabilities, an "auto" strategy means the user has
+    // Auto strategy => leave empty sentinel selected. `suggested_provider`
-    // not pinned a vendor; we honor that by selecting the empty-string
+    // is a UI-only preselect (not persisted until the user clicks Save).
-    // sentinel rather than the resolved fallback provider name.
+    // No current + no suggestion => leave unselected with a placeholder.
    // `suggested_provider` is a UI-only preselect (used by embedding & ASR)
    // when the user has not pinned a vendor yet — purely cosmetic, not
    // persisted until the user clicks Save.
    // For "pick or empty" capabilities (no current, no suggestion), we leave
    // the dropdown unselected and show a muted placeholder so the user is
    // nudged to pick explicitly.
    const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
    const initialProviderValue = pendingProvider
        ? pendingProvider
@@ -3889,20 +4092,82 @@ function renderCapabilityBody(def, cap, body) {
    if (def.needsModel) {
        rebuildCapabilityModelDropdown(def, initialProviderValue, cap.current_model || '', body);
-        // Hide the model picker entirely while the capability is in `auto`
+        // Hide model picker in auto mode — fallback hint below covers it.
        // mode — there is nothing useful to pin, and the fallback hint
        // below explains what'll actually run.
        setCapabilityModelPickerVisible(def, initialProviderValue !== '' || !capabilitySupportsAuto(def.id), body);
    }
    if (def.id === 'tts') {
        rebuildCapabilityVoiceDropdown(
            initialProviderValue,
            cap.current_voice || '',
            body,
            cap.current_model || ''
        );
    }
    // Inject auto/router-pending hint banners before the action footer.
    renderCapabilityHints(def, cap, body, initialProviderValue);
    if (def.id === 'tts') {
        _setTtsConfigVisible(body, (cap.reply_mode || 'off') !== 'off');
    }
 }
-// Toggle visibility of the model picker. Used both at first render and
+// TTS reply-policy dropdown (off / voice_if_voice / always). Persists on
-// whenever the provider dropdown swings between an explicit vendor and the
+// change. When off, hides the rest of the TTS card.
-// "auto" sentinel. We toggle the wrapper rather than re-rendering so the
+function renderVoiceReplyMode(host, currentMode, options) {
-// existing dropdown state survives a round-trip back to a real vendor.
+    options = options || {};
    // host:        element that receives the picker as its first child.
    // currentMode: mode to preselect; anything outside `valid` falls back to 'off'.
    // options.skipVisibilityToggle: when truthy, the initial show/hide of the
    //     sibling nodes is left to the caller.
    const opts = [
        { value: 'off',            label: t('voice_reply_off') },
        { value: 'voice_if_voice', label: t('voice_reply_if_voice') },
        { value: 'always',         label: t('voice_reply_always') },
    ];
    const wrap = document.createElement('div');
    // _setTtsConfigVisible keys on this id to keep the picker itself visible.
    wrap.id = 'voice-reply-mode-wrap';
    wrap.innerHTML = `
        <label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('voice_reply_mode_label')}</label>
        <div id="voice-reply-mode-dd" class="cfg-dropdown" tabindex="0">
            <div class="cfg-dropdown-selected">
                <span class="cfg-dropdown-text">--</span>
                <i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
            </div>
            <div class="cfg-dropdown-menu"></div>
        </div>
    `;
    // Insert ahead of every existing child of host.
    host.prepend(wrap);
    const dd = wrap.querySelector('#voice-reply-mode-dd');
    const valid = ['off', 'voice_if_voice', 'always'];
    const initial = valid.includes(currentMode) ? currentMode : 'off';
    if (!options.skipVisibilityToggle) _setTtsConfigVisible(host, initial !== 'off');
    initDropdown(dd, opts, initial, (mode) => {
        if (!valid.includes(mode)) return;
        // The card is toggled immediately, before the save request settles;
        // a failed request is swallowed below and the toggle is not reverted.
        _setTtsConfigVisible(host, mode !== 'off');
        fetch('/api/models', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ action: 'set_voice_reply_mode', mode }),
        })
            .then(r => r.json())
            .then(data => {
                if (data && data.status === 'success') {
                    _ttsReadyPromise = null;  // force re-probe on next bubble
                }
            })
            .catch(() => {});
    });
 }
 // Show or hide every direct child of `host` except the reply-mode picker
 // (#voice-reply-mode-wrap), by toggling the `hidden` class. A missing host
 // is ignored.
 function _setTtsConfigVisible(host, visible) {
    if (!host) return;
    const shouldHide = !visible;
    for (const section of Array.from(host.children)) {
        if (section.id !== 'voice-reply-mode-wrap') {
            section.classList.toggle('hidden', shouldHide);
        }
    }
 }
 // Toggle wrapper visibility instead of re-rendering so dropdown state survives.
 function setCapabilityModelPickerVisible(def, visible, scope) {
    const root = scope || document;
    const wrap = root.querySelector(`#cap-${def.id}-model-wrap`);
@@ -4135,7 +4400,7 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
    initDropdown(el, opts, initialValue, (value) => {
        const customWrap = document.getElementById(`cap-${def.id}-model-custom-wrap`);
-        if (!customWrap) return;
+        if (customWrap) {
            if (value === '__custom__') {
                customWrap.classList.remove('hidden');
                const input = document.getElementById(`cap-${def.id}-model-custom`);
@@ -4143,6 +4408,14 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
            } else {
                customWrap.classList.add('hidden');
            }
        }
        // TTS voice catalog may be scoped per engine model (aggregating
        // gateways). Rebuild the voice picker whenever the model changes.
        if (def.id === 'tts') {
            const provDd = document.getElementById('cap-tts-provider');
            const provId = provDd ? getDropdownValue(provDd) : '';
            rebuildCapabilityVoiceDropdown(provId, '', null, value);
        }
    });
    const customWrap = root.querySelector(`#cap-${def.id}-model-custom-wrap`);
@@ -4157,22 +4430,93 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
    }
 }
 // TTS-only: (re)populate the voice-timbre picker from
 // modelsState.capabilities.tts.provider_voices for the given provider.
 // The picker wrapper is hidden whenever no voices resolve.
 //
 // provider_voices[providerId] may be:
 //   - an array of voice entries, or
 //   - an object keyed by engine model id, each value being such an array
 //     (the model comes from `modelId`, else from the #cap-tts-model dropdown).
 //
 // Each voice entry may be:
 //   - a bare string  (code doubles as label)
 //   - an object with `value` plus optional `hint` / `label`; the friendly
 //     text becomes the option label while the option value stays the raw code.
 //
 // A `selectedVoice` that is not in the catalog is routed through the
 // "__custom__" option and copied into the free-text input.
 //
 // NOTE(review): when the wrapper is hidden the dropdown is left untouched,
 // so it may still hold a value from a previous provider — confirm that code
 // reading #cap-tts-voice on save cannot persist a stale voice.
 function rebuildCapabilityVoiceDropdown(providerId, selectedVoice, scope, modelId) {
    const root = scope || document;
    const wrap = root.querySelector('#cap-tts-voice-wrap');
    const el = root.querySelector('#cap-tts-voice');
    if (!wrap || !el) return;

    const ttsCap = modelsState.capabilities.tts || {};
    const catalog = ttsCap.provider_voices || {};
    let entries = (providerId && catalog[providerId]) || [];
    // Some providers (gateways) scope voices by engine model id.
    if (entries && !Array.isArray(entries) && typeof entries === 'object') {
        let activeModel = modelId;
        if (!activeModel) {
            const modelDd = root.querySelector('#cap-tts-model');
            activeModel = modelDd ? getDropdownValue(modelDd) : '';
        }
        entries = (activeModel && entries[activeModel]) || [];
    }

    if (!entries || entries.length === 0) {
        wrap.classList.add('hidden');
        return;
    }
    wrap.classList.remove('hidden');

    // Friendly name as the label, raw API code as the secondary hint; the
    // option value is always the raw code.
    const toOption = (entry) => {
        if (typeof entry === 'string') {
            return { value: entry, label: entry };
        }
        const code = entry.value;
        const friendly = entry.hint || entry.label || code;
        return {
            value: code,
            label: friendly,
            hint: friendly === code ? '' : code,
        };
    };
    const opts = entries.map(toOption);
    const codes = opts.map(option => option.value);
    opts.push({ value: '__custom__', label: currentLang === 'zh' ? '自定义...' : 'Custom...' });

    // Off-catalog values route through the custom branch; with nothing
    // selected the first catalog voice is preselected.
    const isCustom = Boolean(selectedVoice) && !codes.includes(selectedVoice);
    const initial = isCustom ? '__custom__' : (selectedVoice || codes[0]);

    // Show/hide the free-text input for `value`. With `overwrite` the input
    // is always (re)filled; otherwise only when it is currently empty.
    const syncCustomField = (value, overwrite) => {
        const customWrap = root.querySelector('#cap-tts-voice-custom-wrap');
        if (!customWrap) return;
        if (value !== '__custom__') {
            customWrap.classList.add('hidden');
            return;
        }
        customWrap.classList.remove('hidden');
        const input = root.querySelector('#cap-tts-voice-custom');
        if (input && (overwrite || !input.value)) {
            input.value = isCustom ? selectedVoice : '';
        }
    };

    initDropdown(el, opts, initial, (value) => {
        syncCustomField(value, false);
    });
    syncCustomField(initial, true);
 }
 function onCapabilityProviderChange(def, providerId, scope) {
    if (def.needsModel) {
-        // For capabilities that support `auto`, switching to the empty
+        // Empty sentinel hides the model picker (capability is in auto mode).
        // sentinel hides the model picker entirely so the card reads as
        // "we'll figure it out"; switching back to a real vendor re-runs
        // the rebuild against the capability-scoped model list.
        const isAuto = providerId === '' && capabilitySupportsAuto(def.id);
        if (!isAuto) {
            rebuildCapabilityModelDropdown(def, providerId, '', scope);
        }
        setCapabilityModelPickerVisible(def, !isAuto, scope);
    }
-    // Refresh the auto-hint so it disappears once the user pins a vendor
+    if (def.id === 'tts') {
-    // and reappears when they swing back to "auto". renderCapabilityHints
+        rebuildCapabilityVoiceDropdown(providerId, '', scope);
-    // now writes directly into the footer's hint slot, so we just call it
+    }
    // again — no need to clean up stale DOM nodes.
    const body = scope || document.querySelector(`[data-cap-body="${def.id}"]`);
    if (body) {
        const cap = modelsState.capabilities[def.id] || {};
@@ -4202,6 +4546,16 @@ function saveCapability(capId) {
    // the backend treats this as "fall back to the runtime chain".
    const isAuto = provider === '' && capabilitySupportsAuto(capId);
    const model = isAuto ? '' : getCapabilityModelValue(def);
    // TTS carries an extra voice timbre (supports free-text custom ids).
    let voice = '';
    if (capId === 'tts' && !isAuto) {
        const voiceDd = document.getElementById(`cap-${capId}-voice`);
        voice = voiceDd ? getDropdownValue(voiceDd) : '';
        if (voice === '__custom__') {
            const input = document.getElementById(`cap-${capId}-voice-custom`);
            voice = input ? input.value.trim() : '';
        }
    }
    // Embedding changes invalidate any pre-existing vector index because
    // dimensions / vendor differ. Gate the save behind a confirm, and on
@@ -4243,19 +4597,19 @@ function saveCapability(capId) {
            return;
        }
    }
-    _persistCapability(capId, provider, model);
+    _persistCapability(capId, provider, model, undefined, { voice });
 }
-function _persistCapability(capId, provider, model, onAfterSuccess) {
+function _persistCapability(capId, provider, model, onAfterSuccess, extras) {
    const payload = { action: 'set_capability', capability: capId, provider_id: provider, model: model };
    if (extras && extras.voice !== undefined) payload.voice = extras.voice;
    fetch('/api/models', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ action: 'set_capability', capability: capId, provider_id: provider, model: model }),
+        body: JSON.stringify(payload),
    }).then(r => r.json()).then(data => {
        if (data.status === 'success') {
-            // Show "Saved" first, then refresh — loadModelsView would
+            // Flash "Saved" before reload so the status survives the rebuild.
            // otherwise rebuild the card and wipe the status span before
            // the user can register the confirmation.
            showStatus(`cap-${capId}-status`, 'models_save_success', false);
            setTimeout(() => {
                loadModelsView({ preserveScroll: true });
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -6,6 +6,7 @@ import logging
 import mimetypes
 import os
 import random
 import shutil
 import threading
 import time
 import uuid
@@ -295,6 +296,12 @@ class WebChannel(ChatChannel):
                    "timestamp": time.time()
                })
                logger.debug(f"SSE done sent for request {request_id}")
                # Auto-trigger TTS once the bot finishes its text reply. The
                # synthesis runs in the background so the chat stream is never
                # blocked; the resulting audio URL is pushed via a follow-up
                # `voice_attach` SSE event and persisted to messages.extras.
                if reply.type == ReplyType.TEXT and content.strip():
                    self._maybe_dispatch_auto_tts(request_id, session_id, content, context)
                return
            # Fallback: polling mode
@@ -461,16 +468,133 @@ class WebChannel(ChatChannel):
        return on_event
    # ------------------------------------------------------------------
    # TTS auto-dispatch
    # ------------------------------------------------------------------
    @staticmethod
    def _resolve_voice_reply_mode() -> str:
        """Collapse the two voice-reply config flags into one policy name.

        Reads ``always_reply_voice`` and ``voice_reply_voice`` from ``conf()``;
        ``always_reply_voice`` wins when both are set.

        Returns:
            ``"always"``, ``"voice_if_voice"`` or ``"off"``.
        """
        flag_to_mode = (
            ("always_reply_voice", "always"),
            ("voice_reply_voice", "voice_if_voice"),
        )
        for flag, mode in flag_to_mode:
            if conf().get(flag, False):
                return mode
        return "off"
    # Mirror of ModelsHandler._TTS_PROVIDERS. zhipu is intentionally omitted
    # from the UI (GLM-TTS prelude beep); pinning it in config.json still works.
    _TTS_PROVIDERS_SUGGEST_ORDER = ["openai", "minimax", "dashscope", "linkai"]
    @classmethod
    def _tts_provider_ready(cls) -> bool:
        """Report whether TTS can plausibly run with the current config.

        True when ``text_to_voice`` is set to a non-blank value, or when any
        provider in ``_TTS_PROVIDERS_SUGGEST_ORDER`` has a non-blank,
        non-placeholder API key configured (the key's config field name is
        looked up in ``ConfigHandler.PROVIDER_MODELS``).
        """
        pinned_provider = (conf().get("text_to_voice") or "").strip()
        if pinned_provider:
            return True
        placeholders = ("YOUR API KEY", "YOUR_API_KEY")
        for provider_id in cls._TTS_PROVIDERS_SUGGEST_ORDER:
            provider_meta = ConfigHandler.PROVIDER_MODELS.get(provider_id) or {}
            key_field = provider_meta.get("api_key_field")
            if key_field:
                api_key = (conf().get(key_field) or "").strip()
                if api_key and api_key not in placeholders:
                    return True
        return False
    def _maybe_dispatch_auto_tts(
        self,
        request_id: str,
        session_id: str,
        text: str,
        context: dict,
    ) -> None:
        """Start background TTS for a finished text reply if policy allows it.

        Nothing is started when the reply mode is ``"off"``, when it is
        ``"voice_if_voice"`` and ``context`` carries no truthy
        ``is_voice_input``, or when no TTS provider is ready. Otherwise
        ``_synthesize_tts_async`` runs on a daemon thread. Any exception
        raised while deciding or spawning is logged at debug level and
        swallowed.
        """
        try:
            mode = self._resolve_voice_reply_mode()
            blocked_by_policy = mode == "off" or (
                mode == "voice_if_voice" and not context.get("is_voice_input")
            )
            if blocked_by_policy or not self._tts_provider_ready():
                return
            worker = threading.Thread(
                target=self._synthesize_tts_async,
                args=(request_id, session_id, text),
                daemon=True,
            )
            worker.start()
        except Exception as e:
            logger.debug(f"[WebChannel] auto-tts dispatch skipped: {e}")
    def _synthesize_tts_async(
        self,
        request_id: str,
        session_id: str,
        text: str,
    ) -> None:
        """Synthesize ``text``, publish the audio and notify the SSE stream.

        Steps, in order:
          1. ``Bridge().fetch_text_to_voice(text)`` must yield a VOICE reply
             with non-empty content.
          2. The audio file is published via ``_publish_tts_audio``.
          3. The URL is attached to the conversation store for ``session_id``
             (best effort; a failure here is only logged at debug level).
          4. A ``voice_attach`` event is put on the request's SSE queue, if
             that queue is still registered.

        Every failure is logged and swallowed; nothing is raised to the caller.
        """
        try:
            from bridge.bridge import Bridge

            voice_reply = Bridge().fetch_text_to_voice(text)
            if voice_reply is None or voice_reply.type != ReplyType.VOICE or not voice_reply.content:
                logger.warning(
                    f"[WebChannel] TTS produced no audio for request {request_id}: reply={voice_reply}"
                )
                return

            url = self._publish_tts_audio(voice_reply.content)
            if not url:
                logger.warning(f"[WebChannel] TTS publish failed for request {request_id}")
                return

            # Persist first so the audio is recorded even if the stream is gone.
            extras = {"audio": {"url": url, "kind": "tts"}}
            try:
                from agent.memory import get_conversation_store

                store = get_conversation_store()
                store.attach_extras_to_last_assistant(session_id, extras)
            except Exception as e:
                logger.debug(f"[WebChannel] tts persist skipped: {e}")

            queue = self.sse_queues.get(request_id)
            if queue is not None:
                event = {
                    "type": "voice_attach",
                    "url": url,
                    "request_id": request_id,
                    "timestamp": time.time(),
                }
                queue.put(event)
                logger.info(f"[WebChannel] TTS voice_attach pushed for request {request_id}: {url}")
            else:
                logger.warning(
                    f"[WebChannel] TTS ready but SSE queue already closed for request {request_id} (url={url})"
                )
        except Exception as e:
            # TTS failures are intentionally silent (no user-facing error).
            logger.warning(f"[WebChannel] TTS synthesis failed: {e}")
    @staticmethod
    def _publish_tts_audio(src_path: str) -> str:
        """Move a synthesized audio file into the upload directory.

        Args:
            src_path: Path of the file produced by the TTS engine. It is
                moved, so it no longer exists at this path on success.

        Returns:
            The public URL (``/uploads/<name>``) of the published file, or an
            empty string when the source is missing or the move fails. Errors
            are logged, never raised.
        """
        try:
            if not src_path or not os.path.isfile(src_path):
                logger.warning(f"[WebChannel] publish_tts_audio missing source: {src_path!r}")
                return ""
            # Keep the engine's extension; fall back to .mp3 when it has none.
            ext = os.path.splitext(src_path)[1].lower() or ".mp3"
            upload_dir = _get_upload_dir()
            os.makedirs(upload_dir, exist_ok=True)
            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            # The timestamp only has second resolution, so the suffix is what
            # keeps names unique. A 4-digit random number could collide for
            # replies synthesized in the same second, and shutil.move would
            # then overwrite another reply's audio; use a uuid4 fragment.
            unique = uuid.uuid4().hex[:12]
            dst_name = f"voice_reply_{ts}_{unique}{ext}"
            dst_path = os.path.join(upload_dir, dst_name)
            shutil.move(src_path, dst_path)
            logger.debug(f"[WebChannel] publish_tts_audio moved {src_path} -> {dst_path}")
            return f"/uploads/{dst_name}"
        except Exception as e:
            logger.warning(f"[WebChannel] publish_tts_audio failed: {e}")
            return ""
    @staticmethod
    def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
-        """Delete voice-input audio files older than `max_age_seconds`.
+        """Drop voice_input_* uploads older than max_age_seconds (run at startup)."""
        Called once at startup. Web mic recordings live in the upload
        directory so the browser can replay them inside the conversation
        bubble. We don't persist them to history, so once a process
        restarts they're useless — but they're never auto-cleaned
        anywhere else, so without this they accumulate over time.
        """
        try:
            upload_dir = _get_upload_dir()
            if not os.path.isdir(upload_dir):
@@ -619,6 +743,10 @@ class WebChannel(ChatChannel):
            prompt = json_data.get('message', '')
            use_sse = json_data.get('stream', True)
            attachments = json_data.get('attachments', [])
            # Tag the message as originating from voice input so the post-reply
            # TTS hook can honour the `voice_if_voice` policy (mirrors the
            # desire_rtype concept used by other channels).
            is_voice_input = bool(json_data.get('is_voice', False))
            # Append file references to the prompt (same format as QQ channel)
            if attachments:
@@ -669,6 +797,11 @@ class WebChannel(ChatChannel):
            context["session_id"] = session_id
            context["receiver"] = session_id
            context["request_id"] = request_id
            if is_voice_input:
                # Web channel runs its own TTS post-pipeline via
                # _maybe_dispatch_auto_tts; don't set desire_rtype here or
                # chat_channel would synthesize a duplicate VOICE reply.
                context["is_voice_input"] = True
            if use_sse:
                context["on_event"] = self._make_sse_callback(request_id)
@@ -696,27 +829,39 @@ class WebChannel(ChatChannel):
        q = self.sse_queues[request_id]
        idle_timeout = 600  # 10 minutes without any real event
        deadline = time.time() + idle_timeout
-        done = False
+        # After the main reply is done we keep the stream open for a short
        # tail so async post-processing (TTS auto-synthesis) can deliver a
        # `voice_attach` event before the client disconnects.
        POST_DONE_TAIL_SECONDS = 60
        post_done = False
        post_deadline = 0.0
        try:
            while time.time() < deadline:
                try:
                    item = q.get(timeout=1)
                except Empty:
                    if post_done and time.time() >= post_deadline:
                        break
                    yield b": keepalive\n\n"
                    continue
                # Real event received, reset idle deadline
                deadline = time.time() + idle_timeout
                payload = json.dumps(item, ensure_ascii=False)
                yield f"data: {payload}\n\n".encode("utf-8")
-                if item.get("type") == "done":
+                itype = item.get("type")
-                    done = True
+                if itype == "done":
-                    break
+                    post_done = True
                    post_deadline = time.time() + POST_DONE_TAIL_SECONDS
                elif itype == "voice_attach":
                    # WSGI buffers the previous chunk until the next yield;
                    # shrink the tail so the generator wakes up quickly to
                    # emit a couple of keepalive comments that push the
                    # voice_attach payload through to the browser.
                    post_done = True
                    post_deadline = time.time() + 2  # 2s post-attach tail
        finally:
            if done:
            self.sse_queues.pop(request_id, None)
    def poll_response(self):
@@ -811,6 +956,7 @@ class WebChannel(ChatChannel):
            '/uploads/(.*)', 'UploadsHandler',
            '/api/file', 'FileServeHandler',
            '/api/voice/asr', 'VoiceAsrHandler',
            '/api/voice/tts', 'VoiceTtsHandler',
            '/poll', 'PollHandler',
            '/stream', 'StreamHandler',
            '/chat', 'ChatHandler',
@@ -936,15 +1082,8 @@ class UploadHandler:
 class VoiceAsrHandler:
-    """
+    """Receive a mic recording, persist it under uploads/ and run ASR.
-    Accept a short audio recording from the web console mic button,
+    Returns {status, text, audio_url} so the UI can render a playback bubble."""
    save it under uploads/ so the browser can replay it, then run it
    through the currently configured ASR provider.
    Returns {status, text, audio_url} on success — the frontend renders
    a voice-message bubble with the playable audio and the transcribed
    caption.
    """
    def POST(self):
        _require_auth()
        web.header('Content-Type', 'application/json; charset=utf-8')
@@ -997,6 +1136,48 @@ class VoiceAsrHandler:
            return json.dumps({"status": "error", "message": str(e)})
 class VoiceTtsHandler:
    """On-demand TTS endpoint backing the in-chat "read aloud" button.

    ``POST /api/voice/tts`` expects a JSON object with ``text`` and an
    optional ``session_id``. The response is always a JSON object:
    ``{"status": "success", "audio_url": ...}`` or
    ``{"status": "error", "message": ...}``.
    """
    def POST(self):
        """Synthesize ``text`` and return the URL of the published audio.

        When ``session_id`` is given, the audio URL is also attached to the
        conversation store (best effort; a failure there does not fail the
        request).
        """
        _require_auth()
        web.header('Content-Type', 'application/json; charset=utf-8')
        try:
            data = json.loads(web.data() or b"{}")
            # Valid JSON that is not an object (list, number, ...) has no
            # .get(); reject it as a client error instead of letting it
            # surface as an AttributeError in the generic handler below.
            if not isinstance(data, dict):
                return json.dumps({"status": "error", "message": "invalid request body"})
            raw_text = data.get("text")
            raw_session_id = data.get("session_id")
            # Non-string values are treated as absent rather than crashing on .strip().
            text = raw_text.strip() if isinstance(raw_text, str) else ""
            session_id = raw_session_id.strip() if isinstance(raw_session_id, str) else ""
            if not text:
                return json.dumps({"status": "error", "message": "empty text"})
            # `@singleton` makes WebChannel a factory function — go via instance.
            channel = WebChannel()
            if not channel._tts_provider_ready():
                return json.dumps({"status": "error", "message": "tts not configured"})
            from bridge.bridge import Bridge
            reply = Bridge().fetch_text_to_voice(text)
            if reply is None or reply.type != ReplyType.VOICE or not reply.content:
                # A non-VOICE reply may carry an error description in .content.
                msg = getattr(reply, "content", "") or "tts failed"
                return json.dumps({"status": "error", "message": str(msg)})
            url = channel._publish_tts_audio(reply.content)
            if not url:
                return json.dumps({"status": "error", "message": "publish failed"})
            if session_id:
                # NOTE(review): judging by its name, attach_extras_to_last_assistant
                # targets the newest assistant message, which is not necessarily
                # the bubble whose button was clicked — confirm.
                try:
                    from agent.memory import get_conversation_store
                    get_conversation_store().attach_extras_to_last_assistant(
                        session_id, {"audio": {"url": url, "kind": "tts"}},
                    )
                except Exception as e:
                    logger.debug(f"[VoiceTtsHandler] persist skipped: {e}")
            return json.dumps({"status": "success", "audio_url": url})
        except Exception as e:
            logger.exception(f"[VoiceTtsHandler] failed: {e}")
            return json.dumps({"status": "error", "message": str(e)})
 class UploadsHandler:
    def GET(self, file_name):
        _require_auth()
@@ -1357,10 +1538,243 @@ class ModelsHandler:
    POST /api/models/capability -> set provider/model for a capability
    """
-    # Capability -> editable flag, current-value resolver, and supported provider
+    # Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS.
    # ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
    _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
-    _TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
+    # Web-console white-list. Other vendors stay usable via direct config.
    _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"]
    # TTS engine catalog (speech models, not voice timbres), keyed by the
    # provider ids listed in _TTS_PROVIDERS. Each entry is either a bare
    # model code or a {value, hint?} dict when a friendly label helps;
    # `value` is the model code and `hint` is display text.
    # Returned to the client as "provider_models" by _tts_capability.
    _TTS_PROVIDER_MODELS = {
        "openai":    ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"],
        "minimax": [
            {"value": "speech-2.8-hd",    "hint": "情绪渲染融合语气词,自然听感"},
            {"value": "speech-2.8-turbo", "hint": "极致生成速度,更自然逼真"},
            {"value": "speech-2.6-hd",    "hint": "超低延时,归一化升级"},
            {"value": "speech-2.6-turbo", "hint": "更快更便宜,适合语音聊天/数字人"},
        ],
        "dashscope": [
            {"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"},
        ],
        # Aggregating gateway: a single endpoint multiplexes several
        # underlying TTS engines, selected via the `model` field.
        # Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES),
        # so the `value` codes below must stay in sync with the keys of
        # _TTS_PROVIDER_VOICES["linkai"].
        "linkai": [
            {"value": "tts-1",  "hint": "OpenAI · 多语种通用"},
            {"value": "doubao", "hint": "字节豆包 · 中文音色丰富"},
            {"value": "baidu",  "hint": "百度 · 中文主播音色"},
        ],
    }
    # Per-provider voice timbres. Entries can be a bare code string
    # (label = code) or {value, hint?} when a friendly secondary label
    # helps recognition. We keep `value` as the raw API code so power
    # users can cross-reference config.json.
    _TTS_PROVIDER_VOICES = {
        "openai":    [
            "alloy", "echo", "fable", "onyx", "nova", "shimmer",
            "ash", "ballad", "coral", "sage", "verse",
        ],
        "minimax": [
            # Mandarin Chinese (full catalog)
            {"value": "male-qn-qingse",                           "hint": "中文 · 青涩青年（男）"},
            {"value": "male-qn-jingying",                         "hint": "中文 · 精英青年（男）"},
            {"value": "male-qn-badao",                            "hint": "中文 · 霸道青年（男）"},
            {"value": "male-qn-daxuesheng",                       "hint": "中文 · 青年大学生（男）"},
            {"value": "female-shaonv",                            "hint": "中文 · 少女（女）"},
            {"value": "female-yujie",                             "hint": "中文 · 御姐（女）"},
            {"value": "female-chengshu",                          "hint": "中文 · 成熟女性（女）"},
            {"value": "female-tianmei",                           "hint": "中文 · 甜美女性（女）"},
            {"value": "male-qn-qingse-jingpin",                   "hint": "中文 · 青涩青年-beta（男）"},
            {"value": "male-qn-jingying-jingpin",                 "hint": "中文 · 精英青年-beta（男）"},
            {"value": "male-qn-badao-jingpin",                    "hint": "中文 · 霸道青年-beta（男）"},
            {"value": "male-qn-daxuesheng-jingpin",               "hint": "中文 · 青年大学生-beta（男）"},
            {"value": "female-shaonv-jingpin",                    "hint": "中文 · 少女-beta（女）"},
            {"value": "female-yujie-jingpin",                     "hint": "中文 · 御姐-beta（女）"},
            {"value": "female-chengshu-jingpin",                  "hint": "中文 · 成熟女性-beta（女）"},
            {"value": "female-tianmei-jingpin",                   "hint": "中文 · 甜美女性-beta（女）"},
            {"value": "clever_boy",                               "hint": "中文 · 聪明男童"},
            {"value": "cute_boy",                                 "hint": "中文 · 可爱男童"},
            {"value": "lovely_girl",                              "hint": "中文 · 萌萌女童"},
            {"value": "cartoon_pig",                              "hint": "中文 · 卡通猪小琪"},
            {"value": "bingjiao_didi",                            "hint": "中文 · 病娇弟弟"},
            {"value": "junlang_nanyou",                           "hint": "中文 · 俊朗男友"},
            {"value": "chunzhen_xuedi",                           "hint": "中文 · 纯真学弟"},
            {"value": "lengdan_xiongzhang",                       "hint": "中文 · 冷淡学长"},
            {"value": "badao_shaoye",                             "hint": "中文 · 霸道少爷"},
            {"value": "tianxin_xiaoling",                         "hint": "中文 · 甜心小玲"},
            {"value": "qiaopi_mengmei",                           "hint": "中文 · 俏皮萌妹"},
            {"value": "wumei_yujie",                              "hint": "中文 · 妩媚御姐"},
            {"value": "diadia_xuemei",                            "hint": "中文 · 嗲嗲学妹"},
            {"value": "danya_xuejie",                             "hint": "中文 · 淡雅学姐"},
            {"value": "Chinese (Mandarin)_Reliable_Executive",    "hint": "中文 · 沉稳高管"},
            {"value": "Chinese (Mandarin)_News_Anchor",           "hint": "中文 · 新闻女声"},
            {"value": "Chinese (Mandarin)_Mature_Woman",          "hint": "中文 · 傲娇御姐"},
            {"value": "Chinese (Mandarin)_Unrestrained_Young_Man","hint": "中文 · 不羁青年"},
            {"value": "Arrogant_Miss",                            "hint": "中文 · 嚣张小姐"},
            {"value": "Robot_Armor",                              "hint": "中文 · 机械战甲"},
            {"value": "Chinese (Mandarin)_Kind-hearted_Antie",    "hint": "中文 · 热心大婶"},
            {"value": "Chinese (Mandarin)_HK_Flight_Attendant",   "hint": "中文 · 港普空姐"},
            {"value": "Chinese (Mandarin)_Humorous_Elder",        "hint": "中文 · 搞笑大爷"},
            {"value": "Chinese (Mandarin)_Gentleman",             "hint": "中文 · 温润男声"},
            {"value": "Chinese (Mandarin)_Warm_Bestie",           "hint": "中文 · 温暖闺蜜"},
            {"value": "Chinese (Mandarin)_Male_Announcer",        "hint": "中文 · 播报男声"},
            {"value": "Chinese (Mandarin)_Sweet_Lady",            "hint": "中文 · 甜美女声"},
            {"value": "Chinese (Mandarin)_Southern_Young_Man",    "hint": "中文 · 南方小哥"},
            {"value": "Chinese (Mandarin)_Wise_Women",            "hint": "中文 · 阅历姐姐"},
            {"value": "Chinese (Mandarin)_Gentle_Youth",          "hint": "中文 · 温润青年"},
            {"value": "Chinese (Mandarin)_Warm_Girl",             "hint": "中文 · 温暖少女"},
            {"value": "Chinese (Mandarin)_Kind-hearted_Elder",    "hint": "中文 · 花甲奶奶"},
            {"value": "Chinese (Mandarin)_Cute_Spirit",           "hint": "中文 · 憨憨萌兽"},
            {"value": "Chinese (Mandarin)_Radio_Host",            "hint": "中文 · 电台男主播"},
            {"value": "Chinese (Mandarin)_Lyrical_Voice",         "hint": "中文 · 抒情男声"},
            {"value": "Chinese (Mandarin)_Straightforward_Boy",   "hint": "中文 · 率真弟弟"},
            {"value": "Chinese (Mandarin)_Sincere_Adult",         "hint": "中文 · 真诚青年"},
            {"value": "Chinese (Mandarin)_Gentle_Senior",         "hint": "中文 · 温柔学姐"},
            {"value": "Chinese (Mandarin)_Stubborn_Friend",       "hint": "中文 · 嘴硬竹马"},
            {"value": "Chinese (Mandarin)_Crisp_Girl",            "hint": "中文 · 清脆少女"},
            {"value": "Chinese (Mandarin)_Pure-hearted_Boy",      "hint": "中文 · 清澈邻家弟弟"},
            {"value": "Chinese (Mandarin)_Soft_Girl",             "hint": "中文 · 柔和少女"},
            # Cantonese (full catalog)
            {"value": "Cantonese_ProfessionalHost（F)",            "hint": "粤语 · 专业女主持"},
            {"value": "Cantonese_GentleLady",                     "hint": "粤语 · 温柔女声"},
            {"value": "Cantonese_ProfessionalHost（M)",            "hint": "粤语 · 专业男主持"},
            {"value": "Cantonese_PlayfulMan",                     "hint": "粤语 · 活泼男声"},
            {"value": "Cantonese_CuteGirl",                       "hint": "粤语 · 可爱女孩"},
            {"value": "Cantonese_KindWoman",                      "hint": "粤语 · 善良女声"},
            # English (curated: 1F + 1M)
            {"value": "English_Graceful_Lady",                    "hint": "英文 · Graceful Lady（女）"},
            {"value": "English_Trustworthy_Man",                  "hint": "英文 · Trustworthy Man（男）"},
            # Japanese (curated: 1F + 1M)
            {"value": "Japanese_KindLady",                        "hint": "日文 · Kind Lady（女）"},
            {"value": "Japanese_LoyalKnight",                     "hint": "日文 · Loyal Knight（男）"},
            # Korean (curated: 1F + 1M)
            {"value": "Korean_SweetGirl",                         "hint": "韩文 · Sweet Girl（女）"},
            {"value": "Korean_CheerfulBoyfriend",                 "hint": "韩文 · Cheerful Boyfriend（男）"},
        ],
        "dashscope": [
            {"value": "Cherry",   "hint": "芊悦 · 阳光女声"},
            {"value": "Serena",   "hint": "苏瑶 · 温柔女声"},
            {"value": "Chelsie",  "hint": "千雪 · 二次元少女"},
            {"value": "Ethan",    "hint": "晨煦 · 阳光男声"},
            {"value": "Moon",     "hint": "月白 · 率性男声"},
            {"value": "Kai",      "hint": "凯 · 治愈男声"},
            {"value": "Nofish",   "hint": "不吃鱼 · 设计师男声"},
            {"value": "Bella",    "hint": "萌宝 · 小萝莉"},
            {"value": "Bunny",    "hint": "萌小姬 · 萌系少女"},
            {"value": "Stella",   "hint": "少女阿月 · 元气少女"},
            {"value": "Neil",     "hint": "阿闻 · 新闻主播"},
            {"value": "Seren",    "hint": "小婉 · 助眠女声"},
            {"value": "Jada",     "hint": "上海话 · 阿珍"},
            {"value": "Dylan",    "hint": "北京话 · 晓东"},
            {"value": "Sunny",    "hint": "四川话 · 晴儿"},
            {"value": "Eric",     "hint": "四川话 · 程川"},
            {"value": "Rocky",    "hint": "粤语 · 阿强"},
            {"value": "Kiki",     "hint": "粤语 · 阿清"},
            {"value": "Peter",    "hint": "天津话 · 李彼得"},
            {"value": "Marcus",   "hint": "陕西话 · 秦川"},
            {"value": "Roy",      "hint": "闽南语 · 阿杰"},
        ],
        # Aggregating gateway: voices are scoped per engine model. The
        # frontend picks the correct list based on the selected model so
        # users don't see incompatible timbres for the active engine.
        "linkai": {
            "tts-1": [
                "alloy", "echo", "fable", "onyx", "nova", "shimmer",
            ],
            "doubao": [
                {"value": "zh_female_wanwanxiaohe_moon_bigtts",       "hint": "湾湾小何"},
                {"value": "BV007_streaming",                          "hint": "亲切女声"},
                {"value": "BV001_streaming",                          "hint": "通用女声"},
                {"value": "BV002_streaming",                          "hint": "通用男声"},
                {"value": "BV051_streaming",                          "hint": "奶气萌娃"},
                {"value": "zh_female_linjianvhai_moon_bigtts",        "hint": "邻家女孩"},
                {"value": "BV700_streaming",                          "hint": "灿灿"},
                {"value": "BV019_streaming",                          "hint": "重庆小伙"},
                {"value": "BV524_streaming",                          "hint": "日语男声"},
                {"value": "BV021_streaming",                          "hint": "东北老铁"},
                {"value": "BV701_streaming",                          "hint": "擎苍"},
                {"value": "BV113_streaming",                          "hint": "甜宠少御"},
                {"value": "BV056_streaming",                          "hint": "阳光男声"},
                {"value": "BV213_streaming",                          "hint": "广西表哥"},
                {"value": "BV119_streaming",                          "hint": "通用赘婿"},
                {"value": "BV705_streaming",                          "hint": "炀炀"},
                {"value": "BV033_streaming",                          "hint": "温柔小哥"},
                {"value": "BV102_streaming",                          "hint": "儒雅青年"},
                {"value": "BV522_streaming",                          "hint": "气质女生"},
                {"value": "BV034_streaming",                          "hint": "知性姐姐 · 双语"},
                {"value": "BV005_streaming",                          "hint": "活泼女声"},
                {"value": "zh_female_wanqudashu_moon_bigtts",         "hint": "湾区大叔"},
                {"value": "zh_female_daimengchuanmei_moon_bigtts",    "hint": "呆萌川妹"},
                {"value": "zh_male_guozhoudege_moon_bigtts",          "hint": "广州德哥"},
                {"value": "zh_male_beijingxiaoye_moon_bigtts",        "hint": "北京小爷"},
                {"value": "zh_male_shaonianzixin_moon_bigtts",        "hint": "少年梓辛 / Brayan"},
                {"value": "zh_female_meilinvyou_moon_bigtts",         "hint": "魅力女友"},
                {"value": "zh_male_shenyeboke_moon_bigtts",           "hint": "深夜播客"},
                {"value": "zh_female_sajiaonvyou_moon_bigtts",        "hint": "柔美女友"},
                {"value": "zh_female_yuanqinvyou_moon_bigtts",        "hint": "撒娇学妹"},
                {"value": "zh_male_haoyuxiaoge_moon_bigtts",          "hint": "浩宇小哥"},
                {"value": "zh_male_guangxiyuanzhou_moon_bigtts",      "hint": "广西远舟"},
                {"value": "zh_female_meituojieer_moon_bigtts",        "hint": "妹坨洁儿"},
                {"value": "zh_male_yuzhouzixuan_moon_bigtts",         "hint": "豫州子轩"},
                {"value": "BV115_streaming",                          "hint": "古风少御"},
                {"value": "zh_female_gaolengyujie_moon_bigtts",       "hint": "高冷御姐"},
                {"value": "zh_male_yuanboxiaoshu_moon_bigtts",        "hint": "渊博小叔"},
                {"value": "zh_male_yangguangqingnian_moon_bigtts",    "hint": "阳光青年"},
                {"value": "zh_male_aojiaobazong_moon_bigtts",         "hint": "傲娇霸总"},
                {"value": "zh_male_jingqiangkanye_moon_bigtts",       "hint": "京腔侃爷 / Harmony"},
                {"value": "zh_female_shuangkuaisisi_moon_bigtts",     "hint": "爽快思思 / Skye"},
                {"value": "zh_male_wennuanahu_moon_bigtts",           "hint": "温暖阿虎 / Alvin"},
                {"value": "multi_female_shuangkuaisisi_moon_bigtts",  "hint": "はるこ / Esmeralda"},
                {"value": "multi_male_jingqiangkanye_moon_bigtts",    "hint": "かずね / Javier or Álvaro"},
                {"value": "multi_female_gaolengyujie_moon_bigtts",    "hint": "あけみ"},
                {"value": "multi_male_wanqudashu_moon_bigtts",        "hint": "ひろし / Roberto"},
                {"value": "ICL_zh_female_bingruoshaonv_tob",          "hint": "病弱少女"},
                {"value": "ICL_zh_female_huoponvhai_tob",             "hint": "活泼女孩"},
                {"value": "ICL_zh_female_heainainai_tob",             "hint": "和蔼奶奶"},
                {"value": "ICL_zh_female_linjuayi_tob",               "hint": "邻居阿姨"},
                {"value": "zh_female_wenrouxiaoya_moon_bigtts",       "hint": "温柔小雅"},
                {"value": "zh_female_tianmeixiaoyuan_moon_bigtts",    "hint": "甜美小源"},
                {"value": "zh_female_qingchezizi_moon_bigtts",        "hint": "清澈梓梓"},
                {"value": "zh_male_dongfanghaoran_moon_bigtts",       "hint": "东方浩然"},
                {"value": "zh_male_jieshuoxiaoming_moon_bigtts",      "hint": "解说小明"},
                {"value": "zh_female_kailangjiejie_moon_bigtts",      "hint": "开朗姐姐"},
                {"value": "zh_male_linjiananhai_moon_bigtts",         "hint": "邻家男孩"},
                {"value": "zh_female_tianmeiyueyue_moon_bigtts",      "hint": "甜美悦悦"},
                {"value": "zh_female_xinlingjitang_moon_bigtts",      "hint": "心灵鸡汤"},
            ],
            "baidu": [
                {"value": "baidu_0",    "hint": "度小美 · 标准女主播"},
                {"value": "baidu_1",    "hint": "度小宇 · 亲切男声"},
                {"value": "baidu_3",    "hint": "度逍遥 · 情感男声"},
                {"value": "baidu_4",    "hint": "度丫丫 · 童声"},
                {"value": "baidu_5",    "hint": "度小娇 · 成熟女主播"},
                {"value": "baidu_5003", "hint": "度逍遥 · 情感男声"},
                {"value": "baidu_5118", "hint": "度小鹿 · 甜美女声"},
                {"value": "baidu_103",  "hint": "度米朵 · 可爱童声"},
                {"value": "baidu_106",  "hint": "度博文 · 专业男主播"},
                {"value": "baidu_110",  "hint": "度小童 · 童声主播"},
                {"value": "baidu_111",  "hint": "度小萌 · 软萌妹子"},
                {"value": "baidu_4003", "hint": "度逍遥 · 情感男声"},
                {"value": "baidu_4100", "hint": "度小雯 · 活力女主播"},
                {"value": "baidu_4103", "hint": "度米朵 · 可爱女声"},
                {"value": "baidu_4105", "hint": "度灵儿 · 清澈女声"},
                {"value": "baidu_4106", "hint": "度博文 · 专业男主播"},
                {"value": "baidu_4115", "hint": "度小贤 · 电台男主播"},
                {"value": "baidu_4117", "hint": "度小乔 · 活泼女声"},
                {"value": "baidu_4119", "hint": "度小鹿 · 甜美女声"},
                {"value": "baidu_4129", "hint": "度小彦 · 知识男主播"},
                {"value": "baidu_4140", "hint": "度小新 · 专业女主播"},
                {"value": "baidu_4143", "hint": "度清风 · 配音男声"},
                {"value": "baidu_4144", "hint": "度姗姗 · 娱乐女声"},
                {"value": "baidu_4149", "hint": "度星河 · 广告男声"},
                {"value": "baidu_4206", "hint": "度博文 · 综艺男声"},
                {"value": "baidu_4226", "hint": "南方 · 电台女主播"},
                {"value": "baidu_4254", "hint": "度小清 · 广告女声"},
                {"value": "baidu_4278", "hint": "度小贝 · 知识女主播"},
            ],
        },
    }
    _EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]
    # Capability-scoped model catalogs. The chat dropdown can reuse the
@@ -1525,7 +1939,7 @@ class ModelsHandler:
    @classmethod
    def _predict_vision_auto(cls, local_config: dict) -> dict:
        """Predict which provider vision.py will actually dispatch to when
-        no tool.vision.model is set. Mirrors the fallback order in
+        no tools.vision.model is set. Mirrors the fallback order in
        agent/tools/vision/vision.py::_resolve_providers so the UI hint
        matches reality."""
        chat = cls._chat_capability(local_config)
@@ -1590,12 +2004,12 @@ class ModelsHandler:
    @classmethod
    def _vision_capability(cls, local_config: dict) -> dict:
-        """Vision model. tool.vision.model is the explicit override; otherwise
+        """Vision model. tools.vision.model is the explicit override; otherwise
        the runtime fallback chain in agent/tools/vision/vision.py decides."""
-        tool_conf = local_config.get("tool") or {}
+        tools_conf = local_config.get("tools") or local_config.get("tool") or {}
-        if not isinstance(tool_conf, dict):
+        if not isinstance(tools_conf, dict):
-            tool_conf = {}
+            tools_conf = {}
-        vision_conf = tool_conf.get("vision") or {}
+        vision_conf = tools_conf.get("vision") or {}
        if not isinstance(vision_conf, dict):
            vision_conf = {}
        user_specified = (vision_conf.get("model") or "").strip()
@@ -1652,14 +2066,38 @@ class ModelsHandler:
    @classmethod
    def _tts_capability(cls, local_config: dict) -> dict:
-        provider_id = (local_config.get("text_to_voice") or "openai").strip().lower()
+        explicit = (local_config.get("text_to_voice") or "").strip().lower()
        # Providers outside the white-list don't drive the picker, but their
        # underlying runtime config is preserved so bridge still routes them.
        ui_provider = explicit if explicit in cls._TTS_PROVIDERS else ""
        suggested = ""
        if not ui_provider:
            for pid in cls._TTS_PROVIDERS:
                meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
                key_field = meta.get("api_key_field")
                if key_field and cls._is_real_key(local_config.get(key_field, "")):
                    suggested = pid
                    break
        return {
            "editable": True,
-            "current_provider": provider_id,
+            "current_provider": ui_provider,
-            "current_model": local_config.get("text_to_voice_model", "") or "",
+            "suggested_provider": suggested,
            "current_model": (local_config.get("text_to_voice_model") or "") if ui_provider else "",
            "current_voice": (local_config.get("tts_voice_id") or "") if ui_provider else "",
            "providers": cls._TTS_PROVIDERS,
            "provider_models": cls._TTS_PROVIDER_MODELS,
            "provider_voices": cls._TTS_PROVIDER_VOICES,
            "reply_mode": cls._tts_reply_mode(local_config),
        }
    @staticmethod
    def _tts_reply_mode(local_config: dict) -> str:
        if local_config.get("always_reply_voice", False):
            return "always"
        if local_config.get("voice_reply_voice", False):
            return "voice_if_voice"
        return "off"
    @classmethod
    def _embedding_capability(cls, local_config: dict) -> dict:
        # Embedding is "pick or empty" — runtime's legacy openai/linkai
@@ -1728,17 +2166,20 @@ class ModelsHandler:
    @classmethod
    def _image_capability(cls, local_config: dict) -> dict:
-        """Image generation. Source of truth: config["skill"]["image-generation"]["model"]
+        """Image generation. Source of truth: config["skills"]["image-generation"]["model"]
        (mirrors the per-skill config schema documented in skills/image-generation).
        The runtime resolver in skills/image-generation/scripts/generate.py
        reads this via the SKILL_IMAGE_GENERATION_MODEL env var that the
        agent_initializer syncs at startup; provider is inferred from the
        model name prefix, mirroring vision.py's design.
        ``skill`` (singular) is still tolerated as a legacy fallback —
        config.load_config() folds it into ``skills`` at startup.
        """
-        skill_node = local_config.get("skill") or {}
+        skills_node = local_config.get("skills") or local_config.get("skill") or {}
-        if not isinstance(skill_node, dict):
+        if not isinstance(skills_node, dict):
-            skill_node = {}
+            skills_node = {}
-        img_node = skill_node.get("image-generation") or {}
+        img_node = skills_node.get("image-generation") or {}
        if not isinstance(img_node, dict):
            img_node = {}
        explicit_model = (img_node.get("model") or "").strip()
@@ -1832,6 +2273,8 @@ class ModelsHandler:
                return self._handle_delete_provider(data)
            if action == "set_capability":
                return self._handle_set_capability(data)
            if action == "set_voice_reply_mode":
                return self._handle_set_voice_reply_mode(data)
            return json.dumps({"status": "error", "message": f"unknown action: {action!r}"})
        except Exception as e:
            logger.error(f"[ModelsHandler] POST failed: {e}")
@@ -1918,7 +2361,7 @@ class ModelsHandler:
        if capability == "asr":
            return self._set_simple("voice_to_text", provider_id)
        if capability == "tts":
-            return self._set_tts(provider_id, model)
+            return self._set_tts(provider_id, model, (data.get("voice") or "").strip())
        if capability == "embedding":
            return self._set_embedding(provider_id, model)
        if capability == "image":
@@ -1926,35 +2369,20 @@ class ModelsHandler:
        return json.dumps({"status": "error", "message": f"capability not editable: {capability}"})
    def _set_image(self, provider_id: str, model: str) -> str:
-        # Source of truth: config["skill"]["image-generation"]["model"].
+        # Source of truth: skills.image-generation.model. provider_id is
-        # provider_id is informational only (used by the UI to highlight a
+        # informational only; the resolver picks the vendor by model prefix.
        # vendor card); the runtime resolver infers the provider from the
        # model name prefix at request time, mirroring vision.py's design.
        # An empty model means "switch back to auto / let the script pick".
        local_config = conf()
        file_cfg = self._read_file_config()
-        def _ensure_skill_node(cfg: dict) -> dict:
+        self._set_nested_namespace_value(local_config, "skills", "image-generation", "model", model or "")
-            skill_node = cfg.get("skill") or {}
+        self._set_nested_namespace_value(file_cfg, "skills", "image-generation", "model", model or "")
-            if not isinstance(skill_node, dict):
+        self._drop_legacy_namespace(local_config, "skill", "skills", child="image-generation")
-                skill_node = {}
+        self._drop_legacy_namespace(file_cfg, "skill", "skills", child="image-generation")
            img_node = skill_node.get("image-generation") or {}
            if not isinstance(img_node, dict):
                img_node = {}
            skill_node["image-generation"] = img_node
            cfg["skill"] = skill_node
            return img_node
        _ensure_skill_node(local_config)["model"] = model or ""
        _ensure_skill_node(file_cfg)["model"] = model or ""
        self._write_file_config(file_cfg)
-        # The skill subprocess (skills/image-generation/scripts/generate.py)
+        # The skill subprocess reads SKILL_IMAGE_GENERATION_MODEL from env at
-        # reads SKILL_IMAGE_GENERATION_MODEL from its environment, which is
+        # startup; mirror the change so live edits apply without restart.
        # only synced from config["skill"] at startup. Update os.environ live
        # so changes take effect on the next call without a restart. An empty
        # model means "clear the override" → drop the env var entirely.
        env_key = "SKILL_IMAGE_GENERATION_MODEL"
        if model:
            os.environ[env_key] = model
@@ -1992,8 +2420,6 @@ class ModelsHandler:
            applied["model"] = model
        if not applied:
            # No-op save (nothing to write). Return success so the UI can
            # confirm the click without showing a misleading error.
            return json.dumps({"status": "success", "applied": {}, "noop": True})
        self._write_file_config(file_cfg)
@@ -2002,34 +2428,66 @@ class ModelsHandler:
        return json.dumps({"status": "success", "applied": applied})
    def _set_vision(self, provider_id: str, model: str) -> str:
-        # Vision uses tool.vision.model (nested). provider_id is informational
+        # Source of truth: tools.vision.model. provider_id is informational
-        # only; the runtime resolver auto-routes by model name prefix.
+        # only; the resolver picks the vendor by model prefix.
        local_config = conf()
        file_cfg = self._read_file_config()
-        tool_node = file_cfg.get("tool") or {}
+        self._set_nested_namespace_value(file_cfg, "tools", "vision", "model", model)
-        if not isinstance(tool_node, dict):
+        self._set_nested_namespace_value(local_config, "tools", "vision", "model", model)
-            tool_node = {}
+        self._drop_legacy_namespace(file_cfg, "tool", "tools", child="vision")
-        vision_node = tool_node.get("vision") or {}
+        self._drop_legacy_namespace(local_config, "tool", "tools", child="vision")
        if not isinstance(vision_node, dict):
            vision_node = {}
        vision_node["model"] = model
        tool_node["vision"] = vision_node
        file_cfg["tool"] = tool_node
        # Mirror into in-memory config so the live agent sees the change.
        runtime_tool = local_config.get("tool") or {}
        if not isinstance(runtime_tool, dict):
            runtime_tool = {}
        runtime_vision = runtime_tool.get("vision") or {}
        if not isinstance(runtime_vision, dict):
            runtime_vision = {}
        runtime_vision["model"] = model
        runtime_tool["vision"] = runtime_vision
        local_config["tool"] = runtime_tool
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] vision model set: {model!r}")
        return json.dumps({"status": "success", "model": model})
    @staticmethod
    def _set_nested_namespace_value(cfg, top: str, name: str, key: str, value):
        """Set ``cfg[top][name][key] = value``, creating missing dicts."""
        bucket = cfg.get(top)
        if not isinstance(bucket, dict):
            bucket = {}
        node = bucket.get(name)
        if not isinstance(node, dict):
            node = {}
        node[key] = value
        bucket[name] = node
        cfg[top] = bucket
    @staticmethod
    def _drop_legacy_namespace(cfg, legacy: str, canonical: str, child: str) -> None:
        """Strip the deprecated singular key so config.json stays single-source."""
        legacy_section = cfg.get(legacy)
        if not isinstance(legacy_section, dict):
            return
        legacy_section.pop(child, None)
        if legacy_section:
            cfg[legacy] = legacy_section
        else:
            cfg.pop(legacy, None)
    def _handle_set_voice_reply_mode(self, data: dict) -> str:
        # UI picker (off / voice_if_voice / always) maps to the legacy
        # always_reply_voice + voice_reply_voice pair that chat_channel.py
        # reads, so all channels (web/feishu/wecom/...) share the routing.
        mode = (data.get("mode") or "").strip().lower()
        if mode not in ("off", "voice_if_voice", "always"):
            return json.dumps({"status": "error", "message": f"invalid mode: {mode!r}"})
        always = (mode == "always")
        if_voice = (mode == "voice_if_voice")
        local_config = conf()
        file_cfg = self._read_file_config()
        local_config["always_reply_voice"] = always
        local_config["voice_reply_voice"] = if_voice
        file_cfg["always_reply_voice"] = always
        file_cfg["voice_reply_voice"] = if_voice
        self._write_file_config(file_cfg)
        logger.info(
            f"[ModelsHandler] voice reply mode set: {mode!r} "
            f"(always_reply_voice={always}, voice_reply_voice={if_voice})"
        )
        return json.dumps({"status": "success", "mode": mode})
    def _set_simple(self, key: str, value: str) -> str:
        local_config = conf()
        file_cfg = self._read_file_config()
@@ -2037,25 +2495,30 @@ class ModelsHandler:
        file_cfg[key] = value
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] {key} set: {value!r}")
-        # Bridge caches voice_to_text routing + bot instance; refresh it
+        # Hot-swap the cached voice bot so the change takes effect immediately.
        # so the change takes effect on the next voice request.
        if key in ("voice_to_text", "text_to_voice"):
            self._refresh_voice_routing()
        return json.dumps({"status": "success", key: value})
-    def _set_tts(self, provider_id: str, model: str) -> str:
+    def _set_tts(self, provider_id: str, model: str, voice: str = "") -> str:
        local_config = conf()
        file_cfg = self._read_file_config()
        if provider_id:
        local_config["text_to_voice"] = provider_id
        file_cfg["text_to_voice"] = provider_id
        if model:
        local_config["text_to_voice_model"] = model
        file_cfg["text_to_voice_model"] = model
        local_config["tts_voice_id"] = voice
        file_cfg["tts_voice_id"] = voice
        self._write_file_config(file_cfg)
-        logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
+        logger.info(
            f"[ModelsHandler] tts updated: provider={provider_id!r} "
            f"model={model!r} voice={voice!r}"
        )
        self._refresh_voice_routing()
-        return json.dumps({"status": "success", "provider": provider_id, "model": model})
+        return json.dumps({
            "status": "success",
            "provider": provider_id, "model": model, "voice": voice,
        })
    @staticmethod
    def _refresh_voice_routing() -> None:
@@ -2066,17 +2529,20 @@ class ModelsHandler:
            logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")
    def _set_embedding(self, provider_id: str, model: str) -> str:
-        # provider_id="" + model="" means "switch back to legacy auto mode".
+        # Two valid states: both empty (reset to pick-or-empty) OR both set.
        # A provider without a model leaves the runtime in a broken half-state,
        # so reject that explicitly instead of silently writing it through.
        if provider_id and not model:
            return json.dumps({
                "status": "error",
                "message": "embedding model is required when a provider is selected",
            })
        local_config = conf()
        file_cfg = self._read_file_config()
        local_config["embedding_provider"] = provider_id
        file_cfg["embedding_provider"] = provider_id
        if model:
        local_config["embedding_model"] = model
        file_cfg["embedding_model"] = model
        else:
            local_config["embedding_model"] = ""
            file_cfg["embedding_model"] = ""
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
        # The next /memory rebuild-index command hot-swaps the provider onto
--- a/config-template.json
+++ b/config-template.json
@@ -16,8 +16,8 @@
  "open_ai_api_base": "https://api.openai.com/v1",
  "gemini_api_key": "",
  "gemini_api_base": "https://generativelanguage.googleapis.com",
-  "voice_to_text": "openai",
+  "voice_to_text": "",
-  "text_to_voice": "openai",
+  "text_to_voice": "",
  "voice_reply_voice": false,
  "speech_recognition": true,
  "group_speech_recognition": false,
--- a/config.py
+++ b/config.py
@@ -330,8 +330,18 @@ def load_config():
    config_str = read_file(config_path)
    logger.debug("[INIT] config str: {}".format(drag_sensitive(config_str)))
-    # 将json字符串反序列化为dict类型
+    # 将json字符串反序列化为dict类型。
-    config = Config(json.loads(config_str))
+    # `object_pairs_hook` lets us catch users who accidentally typed the
    # same key twice (e.g. two `"tools"` blocks) — json.loads would
    # otherwise silently drop all but the last occurrence.
    config = Config(json.loads(config_str, object_pairs_hook=_merge_duplicate_keys))
    # Migrate legacy singular keys (`tool`, `skill`) into the canonical
    # plural buckets so the rest of the codebase only reads one schema.
    # Deep-merge so existing `tools`/`skills` entries are preserved and
    # only missing namespaces are filled in from the legacy section.
    _merge_legacy_namespace(config, legacy="tool",  canonical="tools")
    _merge_legacy_namespace(config, legacy="skill", canonical="skills")
    # override config with environment variables.
    # Some online deployment platforms (e.g. Railway) deploy project from github directly. So you shouldn't put your secrets like api key in a config file, instead use environment variables to override the default config.
@@ -422,7 +432,7 @@ def load_config():
                os.environ[env_key] = str(val)
                injected += 1
-    injected += _sync_skill_config_to_env(config.get("skill", {}))
+    injected += _sync_skill_config_to_env(config.get("skills", {}))
    if injected:
        logger.info("[INIT] Synced {} config values to environment variables".format(injected))
@@ -430,11 +440,90 @@ def load_config():
    config.load_user_datas()
 def _deep_merge_dicts(base: dict, incoming: dict) -> dict:
    """Recursively merge ``incoming`` into ``base`` (incoming wins on leaves)."""
    for key, val in incoming.items():
        if (
            key in base
            and isinstance(base[key], dict)
            and isinstance(val, dict)
        ):
            _deep_merge_dicts(base[key], val)
        else:
            base[key] = val
    return base
 def _merge_duplicate_keys(pairs):
    """object_pairs_hook for json.loads: deep-merge duplicate top-level keys
    (lists concat, dicts merge, scalars take the latter) instead of dropping."""
    out = {}
    duplicates = []
    for key, val in pairs:
        if key not in out:
            out[key] = val
            continue
        duplicates.append(key)
        prev = out[key]
        if isinstance(prev, dict) and isinstance(val, dict):
            _deep_merge_dicts(prev, val)
        elif isinstance(prev, list) and isinstance(val, list):
            prev.extend(val)
        else:
            out[key] = val
    if duplicates:
        # logger may not be wired yet — fall back to print so we never lose the warning.
        unique = sorted(set(duplicates))
        try:
            logger.warning("[INIT] config.json has duplicate keys (merged): %s", unique)
        except Exception:
            print("[INIT] config.json has duplicate keys (merged):", unique)
    return out
 def _merge_legacy_namespace(cfg, legacy: str, canonical: str) -> None:
    """Fold deprecated singular keys (``tool`` / ``skill``) into their plural
    canonical counterparts at load time. Canonical entries always win."""
    legacy_section = cfg.get(legacy)
    if not isinstance(legacy_section, dict) or not legacy_section:
        cfg.pop(legacy, None)
        return
    canonical_section = cfg.get(canonical)
    if not isinstance(canonical_section, dict):
        canonical_section = {}
    merged_keys = []
    for name, val in legacy_section.items():
        if name in canonical_section:
            if isinstance(canonical_section[name], dict) and isinstance(val, dict):
                for sub_key, sub_val in val.items():
                    if (
                        sub_key in canonical_section[name]
                        and isinstance(canonical_section[name][sub_key], dict)
                        and isinstance(sub_val, dict)
                    ):
                        _deep_merge_dicts(sub_val, canonical_section[name][sub_key])
                        canonical_section[name][sub_key] = sub_val
                    else:
                        canonical_section[name].setdefault(sub_key, sub_val)
            continue
        canonical_section[name] = val
        merged_keys.append(name)
    cfg[canonical] = canonical_section
    cfg.pop(legacy, None)
    if merged_keys:
        logger.warning(
            "[INIT] Legacy config key '{}' is deprecated; merged into '{}': {}. "
            "Please rename '{}' to '{}' in your config.json.".format(
                legacy, canonical, merged_keys, legacy, canonical,
            )
        )
 def _sync_skill_config_to_env(skill_section) -> int:
    """Flatten skill-namespaced config into environment variables.
-    Mapping rule: ``config["skill"][<name>][<key>]`` -> ``SKILL_<NAME>_<KEY>``
+    Mapping rule: ``config["skills"][<name>][<key>]`` -> ``SKILL_<NAME>_<KEY>``
-    (e.g. ``skill["image-generation"].model`` -> ``SKILL_IMAGE_GENERATION_MODEL``).
+    (e.g. ``skills["image-generation"].model`` -> ``SKILL_IMAGE_GENERATION_MODEL``).
    This lets subprocess-based skill scripts read their own settings without
    importing project code. Existing env vars are NOT overwritten so the
--- a/docs/en/models/qianfan.mdx
+++ b/docs/en/models/qianfan.mdx
@@ -40,7 +40,7 @@ To force a specific Vision model, set it explicitly in `config.json`:
 ```json
 {
-  "tool": {
+  "tools": {
    "vision": {
      "model": "ernie-4.5-turbo-vl"
    }
--- a/docs/en/releases/v2.0.7.mdx
+++ b/docs/en/releases/v2.0.7.mdx
@@ -11,7 +11,7 @@ New built-in `image-generation` skill supporting text-to-image, image-to-image,
 - **Zero model selection**: Just configure an API key and it works — no need to manually specify a model. You can also name a specific model in conversation (e.g. "draw a cat with seedream")
 - **Flexible control**: Supports `quality`, `size` (512/1K–4K), and `aspect_ratio` parameters, with each provider automatically mapping to its supported values
 - **Image editing**: Pass existing images for editing, style transfer, or multi-image fusion (Seedream supports up to 14 reference images)
- **Skill-level config**: Pin a default model via `skill.image-generation.model` in `config.json`
+- **Skill-level config**: Pin a default model via `skills.image-generation.model` in `config.json`
 - **Image lightbox**: All images in the Web console now support click-to-enlarge preview
 Docs: [Image Generation Skill](https://docs.cowagent.ai/en/skills/image-generation)
--- a/docs/en/releases/v2.0.8.mdx
+++ b/docs/en/releases/v2.0.8.mdx
@@ -51,7 +51,7 @@ The voice and streaming building blocks come from a community contribution #2791
 ## 🔧 Tools and Safety
- **Vision model selection**: `tool.vision.model` config now actually takes effect, with automatic fallback when unconfigured #2792
+- **Vision model selection**: `tools.vision.model` config now actually takes effect, with automatic fallback when unconfigured #2792
 - **Bash safety prompt**: The destructive-deletion confirm prompt is now scoped to paths outside the workspace — routine in-workspace operations are no longer interrupted
 ## 🐛 Other Fixes
--- a/docs/en/skills/image-generation.mdx
+++ b/docs/en/skills/image-generation.mdx
@@ -87,7 +87,7 @@ Configure ARK_API_KEY as xxx
 To force all image generation through a specific provider's model, add this to `config.json`:
 ```json
-"skill": {
+"skills": {
  "image-generation": {
    "model": "seedream-5.0-lite"
  }
--- a/docs/en/tools/vision.mdx
+++ b/docs/en/tools/vision.mdx
@@ -51,7 +51,7 @@ To specify a particular model for the vision tool, add to `config.json`:
 ```json
 {
-    "tool": {
+    "tools": {
        "vision": {
            "model": "ernie-4.5-turbo-vl"
        }
--- a/docs/ja/models/qianfan.mdx
+++ b/docs/ja/models/qianfan.mdx
@@ -40,7 +40,7 @@ description: Baidu Qianfan ERNIE モデル設定
 ```json
 {
-  "tool": {
+  "tools": {
    "vision": {
      "model": "ernie-4.5-turbo-vl"
    }
--- a/docs/ja/releases/v2.0.7.mdx
+++ b/docs/ja/releases/v2.0.7.mdx
@@ -11,7 +11,7 @@ description: CowAgent 2.0.7 - 画像生成スキル（6プロバイダー自動
 - **モデル選択不要**：API Key を設定するだけで使用可能、モデルを手動で指定する必要なし。会話で特定モデルを指名することも可能（例：「seedream で猫を描いて」）
 - **柔軟な制御**：`quality`（画質）、`size`（解像度、512/1K〜4K）、`aspect_ratio`（アスペクト比）パラメータ対応、各プロバイダーが自動的に有効な値にマッピング
 - **画像編集**：既存の画像を渡して編集・スタイル変換・複数画像融合が可能（Seedream は最大 14 枚の参照画像をサポート）
- **スキルレベル設定**：`config.json` の `skill.image-generation.model` でデフォルトモデルを固定可能
+- **スキルレベル設定**：`config.json` の `skills.image-generation.model` でデフォルトモデルを固定可能
 - **画像ライトボックス**：Web コンソールのすべての画像がクリックで拡大プレビュー対応
 ドキュメント：[画像生成スキル](https://docs.cowagent.ai/ja/skills/image-generation)
--- a/docs/ja/releases/v2.0.8.mdx
+++ b/docs/ja/releases/v2.0.8.mdx
@@ -51,7 +51,7 @@ description: CowAgent 2.0.8 - 飛書チャネル全面アップグレード（
 ## 🔧 ツールと安全性
- **Vision モデル選択**：`tool.vision.model` 設定が実際に反映されるようになり、未設定時は自動フォールバック #2792
+- **Vision モデル選択**：`tools.vision.model` 設定が実際に反映されるようになり、未設定時は自動フォールバック #2792
 - **Bash セーフティ確認**：破壊的削除の確認プロンプトをワークスペース外のパスに限定。ワークスペース内の通常操作は中断されません
 ## 🐛 その他の修正
--- a/docs/ja/skills/image-generation.mdx
+++ b/docs/ja/skills/image-generation.mdx
@@ -87,7 +87,7 @@ ARK_API_KEY を xxx に設定して
 すべての画像生成を特定のプロバイダーのモデルで固定したい場合、`config.json` に以下を追加：
 ```json
-"skill": {
+"skills": {
  "image-generation": {
    "model": "seedream-5.0-lite"
  }
--- a/docs/ja/tools/vision.mdx
+++ b/docs/ja/tools/vision.mdx
@@ -51,7 +51,7 @@ Vision ツールで使用するモデルを指定するには、`config.json`
 ```json
 {
-    "tool": {
+    "tools": {
        "vision": {
            "model": "ernie-4.5-turbo-vl"
        }
--- a/docs/models/qianfan.mdx
+++ b/docs/models/qianfan.mdx
@@ -40,7 +40,7 @@ description: 百度千帆 ERNIE 模型配置
 ```json
 {
-  "tool": {
+  "tools": {
    "vision": {
      "model": "ernie-4.5-turbo-vl"
    }
--- a/docs/releases/v2.0.7.mdx
+++ b/docs/releases/v2.0.7.mdx
@@ -11,7 +11,7 @@ description: CowAgent 2.0.7 - 图像生成技能（六厂商自动路由）、
 - **开箱即用**：配置 API Key 即可使用，无需手动指定模型。也支持在对话中指定特定模型
 - **灵活控制**：支持 `quality`（画质）、`size`（分辨率，512/1K~4K）、`aspect_ratio`（宽高比）等参数，各厂商自动适配有效值
 - **图片编辑**：传入已有图片即可进行编辑、风格迁移、多图融合
- **Skill 级配置**：支持通过 `config.json` 中的 `skill.image-generation.model` 固定默认模型
+- **Skill 级配置**：支持通过 `config.json` 中的 `skills.image-generation.model` 固定默认模型
 相关文档：[图像生成技能](https://docs.cowagent.ai/skills/image-generation)
--- a/docs/releases/v2.0.8.mdx
+++ b/docs/releases/v2.0.8.mdx
@@ -46,7 +46,7 @@ description: CowAgent 2.0.8 - 飞书渠道全面升级（语音、流式打字
 ## 🔧 工具与安全
- **图像识别模型**：让 `tool.vision.model` 配置真正生效，未配置时自动 fallback #2792 Thanks CNXudiandian
+- **图像识别模型**：让 `tools.vision.model` 配置真正生效，未配置时自动 fallback #2792 Thanks CNXudiandian
 - **Bash 安全确认**：仅对工作区外的破坏性删除做二次确认，工作区内常规操作不再打扰
 ## 🐛 其他修复
--- a/docs/skills/image-generation.mdx
+++ b/docs/skills/image-generation.mdx
@@ -88,7 +88,7 @@ description: 文生图 / 图生图 / 多图融合，支持多家厂商自动路
 如果想让所有图像生成固定走某个厂商的模型，可以在 `config.json` 里加：
 ```json
-"skill": {
+"skills": {
  "image-generation": {
    "model": "seedream-5.0-lite"
  }
--- a/docs/tools/vision.mdx
+++ b/docs/tools/vision.mdx
@@ -40,7 +40,7 @@ Vision 工具采用多级自动选择 + 自动兜底策略，无需手动配置
 ```json
 {
-    "tool": {
+    "tools": {
        "vision": {
            "model": "gpt-4.1"
        }
--- a/skills/image-generation/scripts/generate.py
+++ b/skills/image-generation/scripts/generate.py
@@ -1110,7 +1110,7 @@ def main():
    # Model resolution priority:
    #   1. Explicit `model` in the call args (agent / user override)
    #   2. SKILL_IMAGE_GENERATION_MODEL env var (synced from
-    #      config["skill"]["image-generation"]["model"] at startup)
+    #      config["skills"]["image-generation"]["model"] at startup)
    #   3. None → fall back to automatic provider routing (try every
    #      provider with a configured API key in global priority order)
    model = args.get("model") or os.environ.get("SKILL_IMAGE_GENERATION_MODEL") or ""
--- a/tests/test_qianfan_provider.py
+++ b/tests/test_qianfan_provider.py
@@ -394,7 +394,7 @@ class TestQianfanVisionTool(unittest.TestCase):
            "open_ai_api_key": "",
            "linkai_api_key": "",
            "use_linkai": False,
-            "tool": {},
+            "tools": {},
        }
        if values:
            data.update(values)
@@ -424,7 +424,7 @@ class TestQianfanVisionTool(unittest.TestCase):
    def test_vision_routes_ernie_model_override_to_qianfan(self):
        fake_conf = self._fake_conf({
            "qianfan_api_key": "test-qianfan-key",
-            "tool": {"vision": {"model": "ernie-4.5-turbo-vl-32k"}},
+            "tools": {"vision": {"model": "ernie-4.5-turbo-vl-32k"}},
        })
        fake_bot = MagicMock()
        fake_bot.call_vision = MagicMock()
--- a/voice/dashscope/dashscope_voice.py
+++ b/voice/dashscope/dashscope_voice.py
@@ -1,20 +1,13 @@
 # encoding:utf-8
-"""
+"""DashScope voice: qwen3-asr-flash (ASR) + qwen3-tts-flash (TTS)
-DashScope (Aliyun Bailian) voice service.
+via dashscope.MultiModalConversation."""
-
+import datetime
 ASR : qwen3-asr-flash via dashscope.MultiModalConversation
 TTS : not yet implemented (see CosyVoice / qwen3-tts)
 Why MultiModalConversation instead of the OpenAI-compatible endpoint:
  - SDK is already a project dep (used by chat/vision)
  - Native API accepts local file:// paths up to 100 QPS without an OSS
    round-trip, which is what we need for the "send a short voice
    message" flow. Public URLs / Base64 also work.
 """
 import os
 import random
 from typing import Optional
 import dashscope
 import requests
 from dashscope import MultiModalConversation
 from bridge.reply import Reply, ReplyType
@@ -25,16 +18,14 @@ from voice.voice import Voice
 DEFAULT_ASR_MODEL = "qwen3-asr-flash"
-# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
+DEFAULT_TTS_MODEL = "qwen3-tts-flash"
-# qwen3-asr-flash-filetrans which is async-only and out of scope here.
+DEFAULT_TTS_VOICE = "Cherry"
 MAX_DURATION_SECONDS = 300
 MAX_FILE_BYTES = 10 * 1024 * 1024
 class DashScopeVoice(Voice):
    def __init__(self):
        # api_key is applied per-call (chat bot does the same) so a live
        # config change via the web console takes effect without restart.
        pass
    def voiceToText(self, voice_file: str):
@@ -83,14 +74,72 @@ class DashScopeVoice(Voice):
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
    def textToVoice(self, text: str):
-        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
+        try:
-        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")
+            api_key = conf().get("dashscope_api_key", "")
            if not api_key:
                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
            dashscope.api_key = api_key
            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
            voice = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
            response = MultiModalConversation.call(
                model=model,
                api_key=api_key,
                text=text,
                voice=voice,
                stream=False,
            )
            url = self._extract_audio_url(response)
            if not url:
                logger.error(f"[DashScopeVoice] textToVoice failed: {response}")
                return Reply(ReplyType.ERROR, "语音合成失败")
            local_path = self._download_audio(url)
            if not local_path:
                return Reply(ReplyType.ERROR, "语音合成失败")
            logger.info(f"[DashScopeVoice] textToVoice model={model} voice={voice} file={local_path}")
            return Reply(ReplyType.VOICE, local_path)
        except Exception as e:
            logger.exception(f"[DashScopeVoice] textToVoice exception: {e}")
            return Reply(ReplyType.ERROR, "语音合成失败")
    @staticmethod
    def _extract_audio_url(response) -> Optional[str]:
        try:
            if getattr(response, "status_code", 200) != 200:
                return None
            audio = response.output.get("audio") if response.output else None
            if isinstance(audio, dict):
                return audio.get("url") or None
            return getattr(audio, "url", None)
        except Exception:
            return None
    @staticmethod
    def _download_audio(url: str) -> Optional[str]:
        """Download ``url`` into ``<cwd>/tmp`` and return the local file path.
        The file name is built from a second-resolution timestamp plus a random
        number. The extension is taken from the URL path (query string ignored)
        when it is one of mp3/wav/m4a/aac/opus, and is ``.wav`` otherwise.
        Any failure is logged and reported as ``None``.
        """
        allowed_exts = (".mp3", ".wav", ".m4a", ".aac", ".opus")
        try:
            out_dir = os.path.join(os.getcwd(), "tmp")
            os.makedirs(out_dir, exist_ok=True)
            stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            url_path = url.split("?", 1)[0]
            suffix = os.path.splitext(url_path)[1].lower()
            if suffix not in allowed_exts:
                # Covers both a missing and an unrecognised extension.
                suffix = ".wav"
            nonce = random.randint(0, 9999)
            target = os.path.join(out_dir, f"dashscope_tts_{stamp}_{nonce}{suffix}")
            reply = requests.get(url, timeout=60)
            reply.raise_for_status()
            with open(target, "wb") as fh:
                fh.write(reply.content)
            return target
        except Exception as err:
            logger.error(f"[DashScopeVoice] download audio failed: {err}")
            return None
    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
-        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.
+        # qwen3-asr-flash doesn't accept AMR/SILK; mp3/wav/m4a/aac/opus pass through.
        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.
        """
        lower = voice_file.lower()
        if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"):
            try:
@@ -98,20 +147,11 @@ class DashScopeVoice(Voice):
                audio_convert.any_to_mp3(voice_file, mp3_file)
                return mp3_file
            except Exception as e:
-                logger.warning(
+                logger.warning(f"[DashScopeVoice] mp3 convert failed: {e}")
                    f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
                    f"submitting original file"
                )
        return voice_file
    @staticmethod
    def _extract_text(response) -> Optional[str]:
        """Pull the recognized text out of MultiModalConversation response.
        Successful shape (result_format="message"):
          response.output.choices[0].message.content -> list of {"text": "..."}
          or in some SDK versions a plain string.
        """
        try:
            if getattr(response, "status_code", 200) != 200:
                return None
--- a/voice/linkai/linkai_voice.py
+++ b/voice/linkai/linkai_voice.py
@@ -1,16 +1,18 @@
-"""
+"""LinkAI voice: Whisper ASR + multi-vendor TTS (OpenAI / Doubao / Baidu)
-google voice service
+proxied via https://docs.link-ai.tech/platform/api/voice-speech."""
-"""
+import datetime
 import os
 import random
 import requests
-from voice import audio_convert
+
 from bridge.reply import Reply, ReplyType
 from common import const
 from common.log import logger
 from config import conf
 from voice import audio_convert
 from voice.voice import Voice
-from common import const
+
 import os
 import datetime
 class LinkAIVoice(Voice):
    def __init__(self):
@@ -21,8 +23,7 @@ class LinkAIVoice(Voice):
        try:
            url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions"
            headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
-            model = None
+            # Pin whisper-1: gateway ignores any other ASR model id.
            if not conf().get("text_to_voice") or conf().get("voice_to_text") == "openai":
            model = const.WHISPER_1
            if voice_file.endswith(".amr"):
                try:
@@ -30,54 +31,59 @@ class LinkAIVoice(Voice):
                    audio_convert.any_to_mp3(voice_file, mp3_file)
                    voice_file = mp3_file
                except Exception as e:
-                    logger.warn(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {format(e)}")
+                    logger.warning(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {e}")
-            file = open(voice_file, "rb")
+            with open(voice_file, "rb") as file:
-            file_body = {
+                res = requests.post(
-                "file": file
+                    url,
-            }
+                    files={"file": file},
-            data = {
+                    headers=headers,
-                "model": model
+                    data={"model": model},
-            }
+                    timeout=(5, 60),
-            res = requests.post(url, files=file_body, headers=headers, data=data, timeout=(5, 60))
+                )
-            if res.status_code == 200:
+            if res.status_code != 200:
-                text = res.json().get("text")
+                msg = ""
-            else:
+                try:
-                res_json = res.json()
+                    msg = res.json().get("message", "")
-                logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={res_json.get('message')}")
+                except Exception:
                    pass
                logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={msg}")
                return None
-            reply = Reply(ReplyType.TEXT, text)
+            text = res.json().get("text")
            logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            logger.error(e)
            return None
        return reply
    def textToVoice(self, text):
        try:
            url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech"
            headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
-            model = const.TTS_1
+            # Gateway routes by `model` (tts-1 / doubao / baidu) + `voice` from
-            if not conf().get("text_to_voice") or conf().get("text_to_voice") in ["openai", const.TTS_1, const.TTS_1_HD]:
+            # that engine's catalog. `app_code` is optional workspace override.
                model = conf().get("text_to_voice_model") or const.TTS_1
            data = {
                "model": model,
                "input": text,
                "voice": conf().get("tts_voice_id"),
-                "app_code": conf().get("linkai_app_code")
+                "app_code": conf().get("linkai_app_code"),
            }
            model = conf().get("text_to_voice_model")
            if model:
                data["model"] = model
            res = requests.post(url, headers=headers, json=data, timeout=(5, 120))
-            if res.status_code == 200:
+            if res.status_code != 200:
                msg = ""
                try:
                    msg = res.json().get("message", "")
                except Exception:
                    pass
                logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={msg}")
                return None
            tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
            os.makedirs(os.path.dirname(tmp_file_name), exist_ok=True)
            with open(tmp_file_name, 'wb') as f:
                f.write(res.content)
-                reply = Reply(ReplyType.VOICE, tmp_file_name)
+            logger.info(f"[LinkVoice] textToVoice success, input={text}, voice_id={data.get('voice')}")
-                logger.info(f"[LinkVoice] textToVoice success, input={text}, model={model}, voice_id={data.get('voice')}")
+            return Reply(ReplyType.VOICE, tmp_file_name)
                return reply
            else:
                res_json = res.json()
                logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={res_json.get('message')}")
                return None
        except Exception as e:
            logger.error(e)
            # reply = Reply(ReplyType.ERROR, "遇到了一点小问题，请稍后再问我吧")
            return None
--- a/voice/minimax/minimax_voice.py
+++ b/voice/minimax/minimax_voice.py
@@ -1,8 +1,7 @@
 # encoding:utf-8
-"""
+"""MiniMax TTS via /v1/t2a_v2 (SSE stream, hex-encoded mp3 chunks)."""
 MiniMax TTS voice service
 """
 import datetime
 import json
 import random
 import requests
@@ -12,24 +11,12 @@ from config import conf
 from voice.voice import Voice
 MINIMAX_TTS_VOICES = [
    "English_Graceful_Lady",
    "English_Insightful_Speaker",
    "English_radiant_girl",
    "English_Persuasive_Man",
    "English_Lucky_Robot",
    "English_expressive_narrator",
    "Chinese_Warm_Woman",
    "Chinese_Gentle_Man",
 ]
 class MinimaxVoice(Voice):
    def __init__(self):
        self.api_key = conf().get("minimax_api_key")
-        self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
+        # Mainland endpoint matches `sk-api-0-...` keys; override via
-        # Strip trailing /v1 if present so we can always append /v1/t2a_v2
+        # `minimax_api_base` for international (api.minimax.io) workspaces.
-        self.api_base = self.api_base.rstrip("/")
+        self.api_base = (conf().get("minimax_api_base") or "https://api.minimaxi.com").rstrip("/")
        if self.api_base.endswith("/v1"):
            self.api_base = self.api_base[:-3]
@@ -68,12 +55,14 @@ class MinimaxVoice(Voice):
            response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
            response.raise_for_status()
-            # Parse SSE stream and collect hex-encoded audio chunks
+            # MiniMax returns HTTP 200 even on errors; capture base_resp for diagnostics.
            audio_chunks = []
-            buffer = ""
+            last_base_resp = None
            event_count = 0
            for raw in response.iter_lines():
                if not raw:
                    continue
                event_count += 1
                line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
                if not line.startswith("data:"):
                    continue
@@ -81,16 +70,31 @@ class MinimaxVoice(Voice):
                if not json_str or json_str == "[DONE]":
                    continue
                try:
                    import json
                    event_data = json.loads(json_str)
                    audio_hex = event_data.get("data", {}).get("audio")
                    if audio_hex:
                        audio_chunks.append(bytes.fromhex(audio_hex))
                except Exception:
                    continue
                base_resp = event_data.get("base_resp") or {}
                if base_resp:
                    last_base_resp = base_resp
                audio_hex = (event_data.get("data") or {}).get("audio")
                if audio_hex:
                    try:
                        audio_chunks.append(bytes.fromhex(audio_hex))
                    except Exception as e:
                        logger.warning(f"[MINIMAX] skip bad audio hex chunk: {e}")
            if not audio_chunks:
-                logger.error("[MINIMAX] TTS returned no audio data")
+                ct = response.headers.get("Content-Type", "")
                if last_base_resp and last_base_resp.get("status_code") not in (None, 0):
                    logger.error(
                        f"[MINIMAX] TTS failed: status_code={last_base_resp.get('status_code')}, "
                        f"status_msg={last_base_resp.get('status_msg')}, model={model}, voice_id={voice_id}"
                    )
                else:
                    logger.error(
                        f"[MINIMAX] TTS returned no audio data, model={model}, voice_id={voice_id}, "
                        f"url={url}, http={response.status_code}, content_type={ct!r}, events={event_count}"
                    )
                return Reply(ReplyType.ERROR, "语音合成失败，未获取到音频数据")
            audio_data = b"".join(audio_chunks)
--- a/voice/openai/openai_voice.py
+++ b/voice/openai/openai_voice.py
@@ -31,7 +31,8 @@ class OpenaiVoice(Voice):
                "file": file,
            }
            data = {
-                "model": "whisper-1",
+                # Override via `voice_to_text_model` (e.g. fall back to whisper-1).
                "model": conf().get("voice_to_text_model") or "gpt-4o-mini-transcribe",
            }
            response = requests.post(url, headers=headers, files=files, data=data)
            response_data = response.json()
--- a/voice/zhipuai/zhipuai_voice.py
+++ b/voice/zhipuai/zhipuai_voice.py
@@ -1,14 +1,8 @@
 # encoding:utf-8
-"""
+"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
-ZhipuAI (BigModel) voice service.
+import datetime
 ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
 TTS : not yet implemented.
 Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
 File size <= 25MB, duration <= 30s per request.
 """
 import os
 import random
 import requests
@@ -20,6 +14,8 @@ from voice.voice import Voice
 DEFAULT_ASR_MODEL = "glm-asr-2512"
 DEFAULT_TTS_MODEL = "glm-tts"
 DEFAULT_TTS_VOICE = "tongtong"
 DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
 MAX_FILE_BYTES = 25 * 1024 * 1024
 REQUEST_TIMEOUT = (5, 60)
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)
 class ZhipuAIVoice(Voice):
    def __init__(self):
        # api_key / base read per-call so live config edits take effect.
        pass
    def voiceToText(self, voice_file: str):
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
    def textToVoice(self, text: str):
        """Synthesize ``text`` to speech via the ZhipuAI ``/audio/speech`` endpoint.

        Configuration is read on every call: ``zhipu_ai_api_key`` (required),
        ``zhipu_ai_api_base``, ``text_to_voice_model`` and ``tts_voice_id``,
        the last three falling back to the module-level defaults.

        Args:
            text: The text to be spoken; sent as the ``input`` field.

        Returns:
            ``Reply(ReplyType.VOICE, path)`` where ``path`` is the audio file
            written under ``tmp/`` on success, otherwise
            ``Reply(ReplyType.ERROR, message)``. Exceptions are logged and
            converted into an error reply; nothing is raised to the caller.
        """
        failure_msg = "语音合成失败，请稍后再试"
        try:
            api_key = conf().get("zhipu_ai_api_key", "")
            if not api_key:
                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")

            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
            url = f"{api_base}/audio/speech"
            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
            voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE

            payload = {
                "model": model,
                "input": text,
                "voice": voice_id,
                "response_format": "wav",
                "speed": 1.0,
                "volume": 1.0,
            }
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            }
            response = requests.post(
                url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                logger.error(
                    f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
                    f"body={response.text[:500]} model={model} voice={voice_id}"
                )
                return Reply(ReplyType.ERROR, failure_msg)

            # Some errors come back as JSON / SSE with HTTP 200. Media types are
            # case-insensitive, so compare against a lower-cased copy.
            ct = response.headers.get("Content-Type", "")
            ct_lower = ct.lower()
            if "application/json" in ct_lower or "text/event-stream" in ct_lower:
                try:
                    err = response.json()
                except Exception:
                    # SSE or malformed JSON: keep a bounded raw excerpt for the log.
                    err = {"raw": response.text[:500]}
                logger.error(
                    f"[ZhipuAIVoice] textToVoice unexpected text response "
                    f"(content_type={ct}): {err}"
                )
                return Reply(ReplyType.ERROR, failure_msg)

            audio_bytes = response.content
            if not audio_bytes:
                # A 200 with an empty body would otherwise be saved as a
                # zero-byte file and handed back as a playable voice reply.
                logger.error(
                    f"[ZhipuAIVoice] textToVoice returned empty audio body: "
                    f"content_type={ct} model={model} voice={voice_id}"
                )
                return Reply(ReplyType.ERROR, failure_msg)

            # Trust the payload's magic bytes over the requested format; fall
            # back to the requested "wav" when the container is not recognised.
            ext = self._sniff_audio_ext(audio_bytes) or "wav"
            file_name = (
                "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                + str(random.randint(0, 1000)) + "." + ext
            )
            os.makedirs(os.path.dirname(file_name), exist_ok=True)
            with open(file_name, "wb") as f:
                f.write(audio_bytes)
            logger.info(
                f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
                f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
            )
            return Reply(ReplyType.VOICE, file_name)
        except Exception as e:
            # Top-level boundary: log with traceback and degrade to an error reply.
            logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
            return Reply(ReplyType.ERROR, failure_msg)
    @staticmethod
    def _sniff_audio_ext(data: bytes) -> str:
        """Detect audio container by magic bytes; returns '' on unknown."""
        if len(data) < 12:
            return ""
        head = data[:12]
        if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
            return "wav"
        if head[:3] == b"ID3" or head[:2] == b"\xff\xfb" or head[:2] == b"\xff\xf3" or head[:2] == b"\xff\xf2":
            return "mp3"
        if head[:4] == b"OggS":
            return "ogg"
        if head[:4] == b"fLaC":
            return "flac"
        return ""
    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
-        # glm-asr-2512 only accepts .wav / .mp3 — convert everything else
+        # glm-asr-2512 only accepts .wav / .mp3
        # (webm from the browser mic, m4a/amr/silk from chat channels, etc).
        lower = voice_file.lower()
        if lower.endswith(".mp3") or lower.endswith(".wav"):
            return voice_file
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
-            logger.warning(
+            logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
                f"submitting original file"
            )
            return voice_file