feat(voice): rework TTS/ASR stack and unify tool/skill config schema

2026-07-17 11:07:11 +08:00 · 2026-05-21 16:00:54 +08:00
parent 2b90f377e6
commit b8333e351c
31 changed files with 1551 additions and 335 deletions
--- a/channel/chat_channel.py
+++ b/channel/chat_channel.py
@@ -171,7 +171,13 @@ class ChatChannel(Channel):
            if "desire_rtype" not in context and conf().get("always_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
                context["desire_rtype"] = ReplyType.VOICE
        elif context.type == ContextType.VOICE:
-            if "desire_rtype" not in context and conf().get("voice_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
+            # Voice input replies with voice when either voice_reply_voice
+            # (mirror voice) or the global always_reply_voice toggle is on.
+            if (
+                "desire_rtype" not in context
+                and (conf().get("voice_reply_voice") or conf().get("always_reply_voice"))
+                and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE
+            ):
                context["desire_rtype"] = ReplyType.VOICE
        return context

--- a/channel/feishu/feishu_channel.py
+++ b/channel/feishu/feishu_channel.py
@@ -1515,10 +1515,16 @@ class FeiShuChanel(ChatChannel):
            else:
                context.type = ContextType.TEXT
            context.content = content.strip()
+            # Text input opts into voice replies only when the always-on toggle is set.
+            if "desire_rtype" not in context and conf().get("always_reply_voice"):
+                context["desire_rtype"] = ReplyType.VOICE

        elif context.type == ContextType.VOICE:
-            # 2.语音请求
-            if "desire_rtype" not in context and conf().get("voice_reply_voice"):
+            # 2.语音请求: voice input replies with voice if either
+            # voice_reply_voice (mirror reply) or always_reply_voice is on.
+            if "desire_rtype" not in context and (
+                conf().get("voice_reply_voice") or conf().get("always_reply_voice")
+            ):
                context["desire_rtype"] = ReplyType.VOICE

        return context
--- a/channel/web/static/css/console.css
+++ b/channel/web/static/css/console.css
@@ -1294,3 +1294,76 @@
    overflow: hidden;
    min-height: 2.5em;  /* ~2 lines at text-sm leading-relaxed */
 }
+
+/* --------------------------------------------------------------------
+ * Voice pill — compact custom audio player used by mic uploads and TTS
+ * replies. Replaces the bulky native <audio controls> with a play/pause
+ * icon + thin progress bar + duration counter so it blends into chat
+ * bubbles without the chrome-grey browser default look.
+ * ------------------------------------------------------------------ */
+.voice-pill {
+    display: inline-flex;
+    align-items: center;
+    gap: 8px;
+    padding: 6px 10px;
+    border-radius: 999px;
+    background: rgba(15, 23, 42, 0.05);
+    color: rgb(71, 85, 105);
+    font-size: 12px;
+    line-height: 1;
+    max-width: 240px;
+    user-select: none;
+    cursor: default;
+}
+.dark .voice-pill {
+    background: rgba(255, 255, 255, 0.08);
+    color: rgb(203, 213, 225);
+}
+.voice-pill[data-loading="1"] {
+    opacity: 0.65;
+}
+.voice-pill-btn {
+    width: 22px;
+    height: 22px;
+    border-radius: 999px;
+    display: inline-flex;
+    align-items: center;
+    justify-content: center;
+    background: var(--color-primary-500, #2563eb);
+    color: #fff;
+    flex-shrink: 0;
+    cursor: pointer;
+    transition: transform 0.1s ease;
+}
+.voice-pill-btn:hover { transform: scale(1.05); }
+.voice-pill-btn i { font-size: 9px; margin-left: 1px; }
+.voice-pill-btn[data-state="play"] i { margin-left: 2px; }
+.voice-pill-btn[data-state="pause"] i { margin-left: 0; }
+.voice-pill-track {
+    flex: 1;
+    height: 3px;
+    border-radius: 999px;
+    background: rgba(100, 116, 139, 0.25);
+    overflow: hidden;
+    min-width: 70px;
+}
+.dark .voice-pill-track {
+    background: rgba(148, 163, 184, 0.25);
+}
+.voice-pill-fill {
+    height: 100%;
+    width: 0%;
+    background: var(--color-primary-500, #2563eb);
+    border-radius: inherit;
+    transition: width 0.1s linear;
+}
+.voice-pill-time {
+    font-variant-numeric: tabular-nums;
+    font-size: 11px;
+    color: inherit;
+    opacity: 0.75;
+    flex-shrink: 0;
+    min-width: 28px;
+    text-align: right;
+}
+.voice-pill audio { display: none; }
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -25,6 +25,7 @@ const I18N = {
        models_add_vendor: '添加厂商',
        models_provider: '厂商',
        models_model: '模型',
+        models_voice: '音色',
        models_configured: '已配置',
        models_not_configured: '未配置',
        models_pick_to_configure: '选择以配置',
@@ -160,6 +161,11 @@ const I18N = {
        mic_permission_denied: '无法访问麦克风，请检查浏览器权限',
        mic_too_short: '录音太短，请重试',
        mic_error: '语音识别失败',
+        speak_msg: '朗读这段回复',
+        voice_reply_mode_label: '语音回复策略',
+        voice_reply_off: '关闭',
+        voice_reply_if_voice: '仅语音问/语音答',
+        voice_reply_always: '总是语音回复',
        attach_menu_folder: '上传文件夹',
        confirm_yes: '确认',
        confirm_cancel: '取消',
@@ -180,6 +186,7 @@ const I18N = {
        models_add_vendor: 'Add Vendor',
        models_provider: 'Provider',
        models_model: 'Model',
+        models_voice: 'Voice',
        models_configured: 'configured',
        models_not_configured: 'not configured',
        models_pick_to_configure: 'pick to configure',
@@ -315,6 +322,11 @@ const I18N = {
        mic_permission_denied: 'Cannot access microphone — check browser permissions',
        mic_too_short: 'Recording too short, please retry',
        mic_error: 'Speech recognition failed',
+        speak_msg: 'Read this reply aloud',
+        voice_reply_mode_label: 'Voice reply policy',
+        voice_reply_off: 'Off',
+        voice_reply_if_voice: 'Voice only if voice input',
+        voice_reply_always: 'Always reply with voice',
        attach_menu_folder: 'Upload Folder',
        confirm_yes: 'Confirm',
        confirm_cancel: 'Cancel',
@@ -1474,6 +1486,7 @@ function sendVoiceMessage(text, audioUrl) {
        message: text,
        stream: true,
        timestamp: timestamp.toISOString(),
+        is_voice: true,
    };

    const MAX_RETRIES = 2;
@@ -1512,19 +1525,19 @@ function sendVoiceMessage(text, audioUrl) {
 function addUserVoiceMessage(audioUrl, caption, timestamp) {
    const el = document.createElement('div');
    el.className = 'flex justify-end px-4 sm:px-6 py-3';
-    // Voice-message bubble: playable <audio> on top, ASR caption beneath.
+    // Voice-message bubble: compact voice pill on top, ASR caption beneath.
    // The bubble keeps the same primary tint as a normal user message so
    // it visually slots into the conversation flow.
    el.innerHTML = `
        <div class="max-w-[75%] sm:max-w-[60%]">
            <div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
-                <audio controls preload="metadata" src="${audioUrl}"
-                       class="block w-[260px] max-w-full h-9"></audio>
+                <div class="user-voice-slot"></div>
                ${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
            </div>
            <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
        </div>
    `;
+    el.querySelector('.user-voice-slot').appendChild(renderVoicePill(audioUrl));
    messagesDiv.appendChild(el);
    _autoScrollEnabled = true;
    scrollChatToBottom(true);
@@ -1639,12 +1652,16 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                    <div class="agent-steps"></div>
                    <div class="answer-content sse-streaming"></div>
                    <div class="media-content"></div>
+                    <div class="bot-audio-slot"></div>
                </div>
                <div class="flex items-center gap-2 mt-1.5">
                    <span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
                    <button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}" style="display:none">
                        <i class="fas fa-copy"></i>
                    </button>
+                    <button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
+                        <i class="fas fa-volume-up"></i>
+                    </button>
                </div>
            </div>
        `;
@@ -1856,11 +1873,12 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                scrollChatToBottom();

            } else if (item.type === 'done') {
+                // Don't close the stream yet: the backend keeps it open
+                // for a short tail to deliver async attachments such as
+                // TTS audio (`voice_attach`). It will close the stream on
+                // its own via onerror once the tail expires.
                done = true;
-                es.close();
-                delete activeStreams[requestId];

-                // item.content may be empty when "done" is only a stream-close signal after media.
                const finalText = item.content || accumulatedText;

                if (!botEl && finalText) {
@@ -1874,6 +1892,7 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                    if (copyBtn && finalText) copyBtn.style.display = '';
                    applyHighlighting(botEl);
                }
+                renderBotSpeakerButton(botEl, finalText);
                scrollChatToBottom();

                if (titleInfo) {
@@ -1883,6 +1902,15 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
                    loadSessionList();
                }

+            } else if (item.type === 'voice_attach') {
+                // TTS finished — attach a playable audio element to the
+                // current bot bubble. The stream closes right after.
+                if (botEl && item.url) {
+                    attachAudioToBotBubble(botEl, item.url, { autoplay: true });
+                }
+                es.close();
+                delete activeStreams[requestId];
+
            } else if (item.type === 'error') {
                done = true;
                es.close();
@@ -1896,7 +1924,10 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
            es.close();
            delete activeStreams[requestId];

-            if (done) return;
+            if (done) {
+                // Normal close after the post-done tail expired; nothing to do.
+                return;
+            }

            if (currentReasoningEl) {
                finalizeThinking(currentReasoningEl, reasoningStartTime, reasoningText);
@@ -2187,21 +2218,174 @@ function createBotMessageEl(content, timestamp, requestId, msg) {
            <div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200">
                ${stepsHtml ? `<div class="agent-steps">${stepsHtml}</div>` : ''}
                <div class="answer-content">${renderMarkdown(displayContent)}</div>
+                <div class="bot-audio-slot"></div>
            </div>
            <div class="flex items-center gap-2 mt-1.5">
                <span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
                <button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}">
                    <i class="fas fa-copy"></i>
                </button>
+                <button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
+                    <i class="fas fa-volume-up"></i>
+                </button>
            </div>
        </div>
    `;
    el.querySelector('.answer-content').dataset.rawMd = displayContent;
+    // Existing TTS attachment (history replay): mount the player up-front.
+    const existingAudio = msg && msg.extras && msg.extras.audio && msg.extras.audio.url;
+    if (existingAudio) {
+        attachAudioToBotBubble(el, existingAudio, { autoplay: false });
+    }
+    renderBotSpeakerButton(el, displayContent);
    applyHighlighting(el);
    bindChatKnowledgeLinks(el);
    return el;
 }

+// Append (or replace) a small audio player inside a bot bubble's
+// dedicated `.bot-audio-slot`. Used by both live TTS pushes and history
+// replay. Silent failures: never throws.
+function attachAudioToBotBubble(botEl, audioUrl, opts) {
+    try {
+        if (!botEl || !audioUrl) return;
+        const slot = botEl.querySelector('.bot-audio-slot');
+        if (!slot) return;
+        slot.innerHTML = '';
+        slot.style.marginTop = '6px';
+        const pill = renderVoicePill(audioUrl, { autoplay: !!(opts && opts.autoplay) });
+        slot.appendChild(pill);
+        const speakBtn = botEl.querySelector('.speak-msg-btn');
+        if (speakBtn) speakBtn.style.display = 'none';
+    } catch (_) { /* silent */ }
+}
+
+// Build a compact play/pause + progress + duration pill that wraps a
+// hidden <audio>. Returns the root element; safe to embed anywhere.
+function renderVoicePill(audioUrl, opts) {
+    opts = opts || {};
+    const wrap = document.createElement('div');
+    wrap.className = 'voice-pill';
+    wrap.innerHTML = `
+        <button type="button" class="voice-pill-btn" data-state="play" aria-label="play">
+            <i class="fas fa-play"></i>
+        </button>
+        <div class="voice-pill-track"><div class="voice-pill-fill"></div></div>
+        <span class="voice-pill-time">0:00</span>
+        <audio preload="metadata" src="${audioUrl}"></audio>
+    `;
+    const btn = wrap.querySelector('.voice-pill-btn');
+    const fill = wrap.querySelector('.voice-pill-fill');
+    const timeEl = wrap.querySelector('.voice-pill-time');
+    const audio = wrap.querySelector('audio');
+
+    const fmt = (s) => {
+        if (!isFinite(s) || s < 0) s = 0;
+        const m = Math.floor(s / 60);
+        const r = Math.floor(s % 60);
+        return `${m}:${r < 10 ? '0' : ''}${r}`;
+    };
+    const setIcon = (state) => {
+        btn.dataset.state = state;
+        btn.querySelector('i').className = state === 'pause' ? 'fas fa-pause' : 'fas fa-play';
+        btn.setAttribute('aria-label', state === 'pause' ? 'pause' : 'play');
+    };
+
+    audio.addEventListener('loadedmetadata', () => {
+        if (audio.duration && isFinite(audio.duration)) timeEl.textContent = fmt(audio.duration);
+    });
+    audio.addEventListener('timeupdate', () => {
+        const dur = audio.duration || 0;
+        if (dur > 0) {
+            fill.style.width = `${Math.min(100, (audio.currentTime / dur) * 100)}%`;
+            timeEl.textContent = fmt(dur - audio.currentTime);
+        }
+    });
+    audio.addEventListener('ended', () => {
+        setIcon('play');
+        fill.style.width = '0%';
+        timeEl.textContent = fmt(audio.duration || 0);
+    });
+    audio.addEventListener('play',  () => setIcon('pause'));
+    audio.addEventListener('pause', () => setIcon('play'));
+
+    btn.addEventListener('click', (e) => {
+        e.stopPropagation();
+        if (audio.paused) {
+            audio.play().catch(() => {});
+        } else {
+            audio.pause();
+        }
+    });
+
+    if (opts.autoplay) {
+        // Autoplay may be blocked by the browser; fall back silently and
+        // let the user tap the play button.
+        const tryPlay = () => audio.play().catch(() => {});
+        if (audio.readyState >= 2) tryPlay();
+        else audio.addEventListener('canplay', tryPlay, { once: true });
+    }
+    return wrap;
+}
+
+// Show the manual "read aloud" button when TTS is configured but the
+// bubble has no audio yet. Lazily probes capability via /api/models so
+// we don't expose the button when nothing can synthesize speech.
+function renderBotSpeakerButton(botEl, text) {
+    if (!botEl || !text || !text.trim()) return;
+    const btn = botEl.querySelector('.speak-msg-btn');
+    if (!btn) return;
+    if (botEl.querySelector('.bot-audio-slot audio')) return;
+    _isTtsReady().then(ready => {
+        if (!ready) return;
+        btn.style.display = '';
+        btn.onclick = () => _triggerManualTts(btn, botEl, text);
+    });
+}
+
+let _ttsReadyPromise = null;
+let _ttsReadyTs = 0;
+function _isTtsReady() {
+    // Cache for 30s to avoid hammering /api/models on every bubble.
+    if (_ttsReadyPromise && Date.now() - _ttsReadyTs < 30000) {
+        return _ttsReadyPromise;
+    }
+    _ttsReadyTs = Date.now();
+    _ttsReadyPromise = fetch('/api/models')
+        .then(r => r.json())
+        .then(data => {
+            const tts = data && data.capabilities && data.capabilities.tts;
+            if (!tts) return false;
+            return Boolean(tts.current_provider || tts.suggested_provider);
+        })
+        .catch(() => false);
+    return _ttsReadyPromise;
+}
+
+function _triggerManualTts(btn, botEl, text) {
+    if (btn.dataset.busy === '1') return;
+    btn.dataset.busy = '1';
+    const icon = btn.querySelector('i');
+    const prev = icon ? icon.className : '';
+    if (icon) icon.className = 'fas fa-spinner fa-spin';
+    fetch('/api/voice/tts', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ text, session_id: sessionId }),
+    })
+        .then(r => r.json())
+        .then(data => {
+            if (data && data.status === 'success' && data.audio_url) {
+                attachAudioToBotBubble(botEl, data.audio_url, { autoplay: true });
+            }
+        })
+        .catch(() => {})
+        .finally(() => {
+            btn.dataset.busy = '0';
+            if (icon) icon.className = prev || 'fas fa-volume-up';
+        });
+}
+
 function addUserMessage(content, timestamp, attachments) {
    const el = createUserMessageEl(content, timestamp, attachments);
    messagesDiv.appendChild(el);
@@ -3842,14 +4026,39 @@ function renderCapabilityBody(def, cap, body) {

    body.innerHTML = providerHtml + modelHtml + dimHtml + footer;

-    // The body subtree is detached from `document` at this moment (the parent
-    // wrap is not yet appended), so we must scope lookups to `body` rather
-    // than calling document.getElementById, which would return null and crash
-    // initDropdown's internal querySelector.
+    // TTS: mount reply-mode above provider; defer off-mode toggle to the end.
+    if (def.id === 'tts') {
+        renderVoiceReplyMode(body, cap.reply_mode || 'off', { skipVisibilityToggle: true });
+        // Voice-timbre picker depends on provider+model; rebuilt by callbacks.
+        const modelWrap = body.querySelector(`#cap-${def.id}-model-wrap`);
+        if (modelWrap) {
+            const voiceWrap = document.createElement('div');
+            voiceWrap.id = `cap-${def.id}-voice-wrap`;
+            voiceWrap.innerHTML = `
+                <label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('models_voice')}</label>
+                <div id="cap-${def.id}-voice" class="cfg-dropdown" tabindex="0">
+                    <div class="cfg-dropdown-selected">
+                        <span class="cfg-dropdown-text">--</span>
+                        <i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
+                    </div>
+                    <div class="cfg-dropdown-menu"></div>
+                </div>
+                <div id="cap-${def.id}-voice-custom-wrap" class="hidden mt-2">
+                    <input id="cap-${def.id}-voice-custom" type="text"
+                           class="w-full px-3 py-2 text-sm rounded-md border border-slate-200 dark:border-slate-700
+                                  bg-white dark:bg-slate-800 text-slate-700 dark:text-slate-200
+                                  placeholder:text-slate-400 dark:placeholder:text-slate-500
+                                  focus:outline-none focus:ring-2 focus:ring-primary-500"
+                           placeholder="voice id" />
+                </div>
+            `;
+            modelWrap.parentNode.insertBefore(voiceWrap, modelWrap.nextSibling);
+        }
+    }
+
+    // `body` is still detached from `document`; scope lookups locally.
    const provDd = body.querySelector(`#cap-${def.id}-provider`);
-    // initDropdown's option shape is {value, label}; we strip our private
-    // _configured/_tracked fields before handing it over so the helper stays
-    // generic, then re-attach status decorations afterwards.
+    // Strip private fields before handing to the generic initDropdown helper.
    const ddOpts = providerOpts.map(o => ({ value: o.value, label: o.label }));

    let pendingProvider = null;
@@ -3860,15 +4069,9 @@ function renderCapabilityBody(def, cap, body) {
        pendingCapabilitySelection = null;
    }

-    // For auto-capable capabilities, an "auto" strategy means the user has
-    // not pinned a vendor; we honor that by selecting the empty-string
-    // sentinel rather than the resolved fallback provider name.
-    // `suggested_provider` is a UI-only preselect (used by embedding & ASR)
-    // when the user has not pinned a vendor yet — purely cosmetic, not
-    // persisted until the user clicks Save.
-    // For "pick or empty" capabilities (no current, no suggestion), we leave
-    // the dropdown unselected and show a muted placeholder so the user is
-    // nudged to pick explicitly.
+    // Auto strategy => leave empty sentinel selected. `suggested_provider`
+    // is a UI-only preselect (not persisted until the user clicks Save).
+    // No current + no suggestion => leave unselected with a placeholder.
    const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
    const initialProviderValue = pendingProvider
        ? pendingProvider
@@ -3889,20 +4092,82 @@ function renderCapabilityBody(def, cap, body) {

    if (def.needsModel) {
        rebuildCapabilityModelDropdown(def, initialProviderValue, cap.current_model || '', body);
-        // Hide the model picker entirely while the capability is in `auto`
-        // mode — there is nothing useful to pin, and the fallback hint
-        // below explains what'll actually run.
+        // Hide model picker in auto mode — fallback hint below covers it.
        setCapabilityModelPickerVisible(def, initialProviderValue !== '' || !capabilitySupportsAuto(def.id), body);
    }

+    if (def.id === 'tts') {
+        rebuildCapabilityVoiceDropdown(
+            initialProviderValue,
+            cap.current_voice || '',
+            body,
+            cap.current_model || ''
+        );
+    }
+
    // Inject auto/router-pending hint banners before the action footer.
    renderCapabilityHints(def, cap, body, initialProviderValue);
+
+    if (def.id === 'tts') {
+        _setTtsConfigVisible(body, (cap.reply_mode || 'off') !== 'off');
+    }
 }

-// Toggle visibility of the model picker. Used both at first render and
-// whenever the provider dropdown swings between an explicit vendor and the
-// "auto" sentinel. We toggle the wrapper rather than re-rendering so the
-// existing dropdown state survives a round-trip back to a real vendor.
+// TTS reply-policy dropdown (off / voice_if_voice / always). Persists on
+// change. When off, hides the rest of the TTS card.
+function renderVoiceReplyMode(host, currentMode, options) {
+    options = options || {};
+    const opts = [
+        { value: 'off',            label: t('voice_reply_off') },
+        { value: 'voice_if_voice', label: t('voice_reply_if_voice') },
+        { value: 'always',         label: t('voice_reply_always') },
+    ];
+    const wrap = document.createElement('div');
+    wrap.id = 'voice-reply-mode-wrap';
+    wrap.innerHTML = `
+        <label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('voice_reply_mode_label')}</label>
+        <div id="voice-reply-mode-dd" class="cfg-dropdown" tabindex="0">
+            <div class="cfg-dropdown-selected">
+                <span class="cfg-dropdown-text">--</span>
+                <i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
+            </div>
+            <div class="cfg-dropdown-menu"></div>
+        </div>
+    `;
+    host.prepend(wrap);
+
+    const dd = wrap.querySelector('#voice-reply-mode-dd');
+    const valid = ['off', 'voice_if_voice', 'always'];
+    const initial = valid.includes(currentMode) ? currentMode : 'off';
+    if (!options.skipVisibilityToggle) _setTtsConfigVisible(host, initial !== 'off');
+    initDropdown(dd, opts, initial, (mode) => {
+        if (!valid.includes(mode)) return;
+        _setTtsConfigVisible(host, mode !== 'off');
+        fetch('/api/models', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ action: 'set_voice_reply_mode', mode }),
+        })
+            .then(r => r.json())
+            .then(data => {
+                if (data && data.status === 'success') {
+                    _ttsReadyPromise = null;  // force re-probe on next bubble
+                }
+            })
+            .catch(() => {});
+    });
+}
+
+// Show/hide everything in the TTS card below the reply-mode dropdown.
+function _setTtsConfigVisible(host, visible) {
+    if (!host) return;
+    Array.from(host.children).forEach((child) => {
+        if (child.id === 'voice-reply-mode-wrap') return;
+        child.classList.toggle('hidden', !visible);
+    });
+}
+
+// Toggle wrapper visibility instead of re-rendering so dropdown state survives.
 function setCapabilityModelPickerVisible(def, visible, scope) {
    const root = scope || document;
    const wrap = root.querySelector(`#cap-${def.id}-model-wrap`);
@@ -4135,13 +4400,21 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {

    initDropdown(el, opts, initialValue, (value) => {
        const customWrap = document.getElementById(`cap-${def.id}-model-custom-wrap`);
-        if (!customWrap) return;
-        if (value === '__custom__') {
-            customWrap.classList.remove('hidden');
-            const input = document.getElementById(`cap-${def.id}-model-custom`);
-            if (input && !input.value) input.value = selectedModel || '';
-        } else {
-            customWrap.classList.add('hidden');
+        if (customWrap) {
+            if (value === '__custom__') {
+                customWrap.classList.remove('hidden');
+                const input = document.getElementById(`cap-${def.id}-model-custom`);
+                if (input && !input.value) input.value = selectedModel || '';
+            } else {
+                customWrap.classList.add('hidden');
+            }
+        }
+        // TTS voice catalog may be scoped per engine model (aggregating
+        // gateways). Rebuild the voice picker whenever the model changes.
+        if (def.id === 'tts') {
+            const provDd = document.getElementById('cap-tts-provider');
+            const provId = provDd ? getDropdownValue(provDd) : '';
+            rebuildCapabilityVoiceDropdown(provId, '', null, value);
        }
    });

@@ -4157,22 +4430,93 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
    }
 }

+// TTS-only: rebuild the voice timbre picker against the provider's
+// curated voice list. Hidden when no provider is picked.
+//
+// Each voice entry may be:
+//   - a bare string  (code = label)
+//   - {value, label, hint?}   so we can show a friendly Chinese name
+//     while persisting the raw API code that the runtime sends.
+function rebuildCapabilityVoiceDropdown(providerId, selectedVoice, scope, modelId) {
+    const root = scope || document;
+    const wrap = root.querySelector(`#cap-tts-voice-wrap`);
+    const el = root.querySelector(`#cap-tts-voice`);
+    if (!wrap || !el) return;
+    const cap = modelsState.capabilities.tts || {};
+    const voicesByProvider = cap.provider_voices || {};
+    let raw = (providerId && voicesByProvider[providerId]) || [];
+    // Some providers (gateways) scope voices by engine model id.
+    if (raw && !Array.isArray(raw) && typeof raw === 'object') {
+        const activeModel = modelId
+            || (root.querySelector(`#cap-tts-model`) ? getDropdownValue(root.querySelector(`#cap-tts-model`)) : '');
+        raw = (activeModel && raw[activeModel]) || [];
+    }
+    if (!raw || raw.length === 0) {
+        wrap.classList.add('hidden');
+        return;
+    }
+    wrap.classList.remove('hidden');
+    // Voice picker: friendly name on the left, raw API code as right-hand
+    // hint. Persisted/sent value is always the raw code.
+    const codes = [];
+    const opts = raw.map(entry => {
+        if (typeof entry === 'string') {
+            codes.push(entry);
+            return { value: entry, label: entry };
+        }
+        codes.push(entry.value);
+        const code = entry.value;
+        const desc = entry.hint || entry.label || code;
+        return {
+            value: code,
+            label: desc,
+            hint: desc === code ? '' : code,
+        };
+    });
+    opts.push({ value: '__custom__', label: currentLang === 'zh' ? '自定义...' : 'Custom...' });
+
+    // Off-catalog values route through the custom branch.
+    let initial = selectedVoice || '';
+    const isCustom = initial && !codes.includes(initial);
+    if (isCustom) initial = '__custom__';
+    if (!initial) initial = codes[0];
+
+    initDropdown(el, opts, initial, (value) => {
+        const customWrap = root.querySelector(`#cap-tts-voice-custom-wrap`);
+        if (!customWrap) return;
+        if (value === '__custom__') {
+            customWrap.classList.remove('hidden');
+            const input = root.querySelector(`#cap-tts-voice-custom`);
+            if (input && !input.value) input.value = isCustom ? selectedVoice : '';
+        } else {
+            customWrap.classList.add('hidden');
+        }
+    });
+
+    const customWrap = root.querySelector(`#cap-tts-voice-custom-wrap`);
+    if (customWrap) {
+        if (initial === '__custom__') {
+            customWrap.classList.remove('hidden');
+            const input = root.querySelector(`#cap-tts-voice-custom`);
+            if (input) input.value = isCustom ? selectedVoice : '';
+        } else {
+            customWrap.classList.add('hidden');
+        }
+    }
+}
+
 function onCapabilityProviderChange(def, providerId, scope) {
    if (def.needsModel) {
-        // For capabilities that support `auto`, switching to the empty
-        // sentinel hides the model picker entirely so the card reads as
-        // "we'll figure it out"; switching back to a real vendor re-runs
-        // the rebuild against the capability-scoped model list.
+        // Empty sentinel hides the model picker (capability is in auto mode).
        const isAuto = providerId === '' && capabilitySupportsAuto(def.id);
        if (!isAuto) {
            rebuildCapabilityModelDropdown(def, providerId, '', scope);
        }
        setCapabilityModelPickerVisible(def, !isAuto, scope);
    }
-    // Refresh the auto-hint so it disappears once the user pins a vendor
-    // and reappears when they swing back to "auto". renderCapabilityHints
-    // now writes directly into the footer's hint slot, so we just call it
-    // again — no need to clean up stale DOM nodes.
+    if (def.id === 'tts') {
+        rebuildCapabilityVoiceDropdown(providerId, '', scope);
+    }
    const body = scope || document.querySelector(`[data-cap-body="${def.id}"]`);
    if (body) {
        const cap = modelsState.capabilities[def.id] || {};
@@ -4202,6 +4546,16 @@ function saveCapability(capId) {
    // the backend treats this as "fall back to the runtime chain".
    const isAuto = provider === '' && capabilitySupportsAuto(capId);
    const model = isAuto ? '' : getCapabilityModelValue(def);
+    // TTS carries an extra voice timbre (supports free-text custom ids).
+    let voice = '';
+    if (capId === 'tts' && !isAuto) {
+        const voiceDd = document.getElementById(`cap-${capId}-voice`);
+        voice = voiceDd ? getDropdownValue(voiceDd) : '';
+        if (voice === '__custom__') {
+            const input = document.getElementById(`cap-${capId}-voice-custom`);
+            voice = input ? input.value.trim() : '';
+        }
+    }

    // Embedding changes invalidate any pre-existing vector index because
    // dimensions / vendor differ. Gate the save behind a confirm, and on
@@ -4243,19 +4597,19 @@ function saveCapability(capId) {
            return;
        }
    }
-    _persistCapability(capId, provider, model);
+    _persistCapability(capId, provider, model, undefined, { voice });
 }

-function _persistCapability(capId, provider, model, onAfterSuccess) {
+function _persistCapability(capId, provider, model, onAfterSuccess, extras) {
+    const payload = { action: 'set_capability', capability: capId, provider_id: provider, model: model };
+    if (extras && extras.voice !== undefined) payload.voice = extras.voice;
    fetch('/api/models', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ action: 'set_capability', capability: capId, provider_id: provider, model: model }),
+        body: JSON.stringify(payload),
    }).then(r => r.json()).then(data => {
        if (data.status === 'success') {
-            // Show "Saved" first, then refresh — loadModelsView would
-            // otherwise rebuild the card and wipe the status span before
-            // the user can register the confirmation.
+            // Flash "Saved" before reload so the status survives the rebuild.
            showStatus(`cap-${capId}-status`, 'models_save_success', false);
            setTimeout(() => {
                loadModelsView({ preserveScroll: true });
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -6,6 +6,7 @@ import logging
 import mimetypes
 import os
 import random
+import shutil
 import threading
 import time
 import uuid
@@ -295,6 +296,12 @@ class WebChannel(ChatChannel):
                    "timestamp": time.time()
                })
                logger.debug(f"SSE done sent for request {request_id}")
+                # Auto-trigger TTS once the bot finishes its text reply. The
+                # synthesis runs in the background so the chat stream is never
+                # blocked; the resulting audio URL is pushed via a follow-up
+                # `voice_attach` SSE event and persisted to messages.extras.
+                if reply.type == ReplyType.TEXT and content.strip():
+                    self._maybe_dispatch_auto_tts(request_id, session_id, content, context)
                return

            # Fallback: polling mode
@@ -461,16 +468,133 @@ class WebChannel(ChatChannel):

        return on_event

+    # ------------------------------------------------------------------
+    # TTS auto-dispatch
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _resolve_voice_reply_mode() -> str:
+        """
+        Decide the TTS auto-reply policy.
+
+        Source of truth is the cross-channel pair
+        (`always_reply_voice`, `voice_reply_voice`) which chat_channel
+        also consults. The web UI presents these as a single three-state
+        picker (off / voice_if_voice / always) via a lossless mapping.
+        """
+        if conf().get("always_reply_voice", False):
+            return "always"
+        if conf().get("voice_reply_voice", False):
+            return "voice_if_voice"
+        return "off"
+
+    # Mirror of ModelsHandler._TTS_PROVIDERS. zhipu is intentionally omitted
+    # from the UI (GLM-TTS prelude beep); pinning it in config.json still works.
+    _TTS_PROVIDERS_SUGGEST_ORDER = ["openai", "minimax", "dashscope", "linkai"]
+
+    @classmethod
+    def _tts_provider_ready(cls) -> bool:
+        """True if user picked a provider OR any suggested vendor has an API key."""
+        if (conf().get("text_to_voice") or "").strip():
+            return True
+        for pid in cls._TTS_PROVIDERS_SUGGEST_ORDER:
+            meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
+            key_field = meta.get("api_key_field")
+            if not key_field:
+                continue
+            val = (conf().get(key_field) or "").strip()
+            if val and val not in ("YOUR API KEY", "YOUR_API_KEY"):
+                return True
+        return False
+
+    def _maybe_dispatch_auto_tts(
+        self,
+        request_id: str,
+        session_id: str,
+        text: str,
+        context: dict,
+    ) -> None:
+        try:
+            mode = self._resolve_voice_reply_mode()
+            if mode == "off":
+                return
+            if mode == "voice_if_voice" and not context.get("is_voice_input"):
+                return
+            if not self._tts_provider_ready():
+                return
+            threading.Thread(
+                target=self._synthesize_tts_async,
+                args=(request_id, session_id, text),
+                daemon=True,
+            ).start()
+        except Exception as e:
+            logger.debug(f"[WebChannel] auto-tts dispatch skipped: {e}")
+
+    def _synthesize_tts_async(
+        self,
+        request_id: str,
+        session_id: str,
+        text: str,
+    ) -> None:
+        try:
+            from bridge.bridge import Bridge
+            reply = Bridge().fetch_text_to_voice(text)
+            if reply is None or reply.type != ReplyType.VOICE or not reply.content:
+                logger.warning(
+                    f"[WebChannel] TTS produced no audio for request {request_id}: "
+                    f"reply={reply}"
+                )
+                return
+            url = self._publish_tts_audio(reply.content)
+            if not url:
+                logger.warning(f"[WebChannel] TTS publish failed for request {request_id}")
+                return
+            payload = {"audio": {"url": url, "kind": "tts"}}
+            try:
+                from agent.memory import get_conversation_store
+                get_conversation_store().attach_extras_to_last_assistant(session_id, payload)
+            except Exception as e:
+                logger.debug(f"[WebChannel] tts persist skipped: {e}")
+            q = self.sse_queues.get(request_id)
+            if q is None:
+                logger.warning(
+                    f"[WebChannel] TTS ready but SSE queue already closed "
+                    f"for request {request_id} (url={url})"
+                )
+                return
+            q.put({
+                "type": "voice_attach",
+                "url": url,
+                "request_id": request_id,
+                "timestamp": time.time(),
+            })
+            logger.info(f"[WebChannel] TTS voice_attach pushed for request {request_id}: {url}")
+        except Exception as e:
+            # TTS failures are intentionally silent (no user-facing error).
+            logger.warning(f"[WebChannel] TTS synthesis failed: {e}")
+
+    @staticmethod
+    def _publish_tts_audio(src_path: str) -> str:
+        """Move a TTS file into uploads/ and return its public URL."""
+        try:
+            if not src_path or not os.path.isfile(src_path):
+                logger.warning(f"[WebChannel] publish_tts_audio missing source: {src_path!r}")
+                return ""
+            ext = os.path.splitext(src_path)[1].lower() or ".mp3"
+            upload_dir = _get_upload_dir()
+            os.makedirs(upload_dir, exist_ok=True)
+            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+            dst_name = f"voice_reply_{ts}_{random.randint(0, 9999)}{ext}"
+            dst_path = os.path.join(upload_dir, dst_name)
+            shutil.move(src_path, dst_path)
+            logger.debug(f"[WebChannel] publish_tts_audio moved {src_path} -> {dst_path}")
+            return f"/uploads/{dst_name}"
+        except Exception as e:
+            logger.warning(f"[WebChannel] publish_tts_audio failed: {e}")
+            return ""
+
    @staticmethod
    def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
-        """Delete voice-input audio files older than `max_age_seconds`.
-
-        Called once at startup. Web mic recordings live in the upload
-        directory so the browser can replay them inside the conversation
-        bubble. We don't persist them to history, so once a process
-        restarts they're useless — but they're never auto-cleaned
-        anywhere else, so without this they accumulate over time.
-        """
+        """Drop voice_input_* uploads older than max_age_seconds (run at startup)."""
        try:
            upload_dir = _get_upload_dir()
            if not os.path.isdir(upload_dir):
@@ -619,6 +743,10 @@ class WebChannel(ChatChannel):
            prompt = json_data.get('message', '')
            use_sse = json_data.get('stream', True)
            attachments = json_data.get('attachments', [])
+            # Tag the message as originating from voice input so the post-reply
+            # TTS hook can honour the `voice_if_voice` policy (mirrors the
+            # desire_rtype concept used by other channels).
+            is_voice_input = bool(json_data.get('is_voice', False))

            # Append file references to the prompt (same format as QQ channel)
            if attachments:
@@ -669,6 +797,11 @@ class WebChannel(ChatChannel):
            context["session_id"] = session_id
            context["receiver"] = session_id
            context["request_id"] = request_id
+            if is_voice_input:
+                # Web channel runs its own TTS post-pipeline via
+                # _maybe_dispatch_auto_tts; don't set desire_rtype here or
+                # chat_channel would synthesize a duplicate VOICE reply.
+                context["is_voice_input"] = True

            if use_sse:
                context["on_event"] = self._make_sse_callback(request_id)
@@ -696,28 +829,40 @@ class WebChannel(ChatChannel):
        q = self.sse_queues[request_id]
        idle_timeout = 600  # 10 minutes without any real event
        deadline = time.time() + idle_timeout
-        done = False
+        # After the main reply is done we keep the stream open for a short
+        # tail so async post-processing (TTS auto-synthesis) can deliver a
+        # `voice_attach` event before the client disconnects.
+        POST_DONE_TAIL_SECONDS = 60
+        post_done = False
+        post_deadline = 0.0

        try:
            while time.time() < deadline:
                try:
                    item = q.get(timeout=1)
                except Empty:
+                    if post_done and time.time() >= post_deadline:
+                        break
                    yield b": keepalive\n\n"
                    continue

-                # Real event received, reset idle deadline
                deadline = time.time() + idle_timeout
-
                payload = json.dumps(item, ensure_ascii=False)
                yield f"data: {payload}\n\n".encode("utf-8")

-                if item.get("type") == "done":
-                    done = True
-                    break
+                itype = item.get("type")
+                if itype == "done":
+                    post_done = True
+                    post_deadline = time.time() + POST_DONE_TAIL_SECONDS
+                elif itype == "voice_attach":
+                    # WSGI buffers the previous chunk until the next yield;
+                    # shrink the tail so the generator wakes up quickly to
+                    # emit a couple of keepalive comments that push the
+                    # voice_attach payload through to the browser.
+                    post_done = True
+                    post_deadline = time.time() + 2  # 2s post-attach tail
        finally:
-            if done:
-                self.sse_queues.pop(request_id, None)
+            self.sse_queues.pop(request_id, None)

    def poll_response(self):
        """
@@ -811,6 +956,7 @@ class WebChannel(ChatChannel):
            '/uploads/(.*)', 'UploadsHandler',
            '/api/file', 'FileServeHandler',
            '/api/voice/asr', 'VoiceAsrHandler',
+            '/api/voice/tts', 'VoiceTtsHandler',
            '/poll', 'PollHandler',
            '/stream', 'StreamHandler',
            '/chat', 'ChatHandler',
@@ -936,15 +1082,8 @@ class UploadHandler:


 class VoiceAsrHandler:
-    """
-    Accept a short audio recording from the web console mic button,
-    save it under uploads/ so the browser can replay it, then run it
-    through the currently configured ASR provider.
-
-    Returns {status, text, audio_url} on success — the frontend renders
-    a voice-message bubble with the playable audio and the transcribed
-    caption.
-    """
+    """Receive a mic recording, persist it under uploads/ and run ASR.
+    Returns {status, text, audio_url} so the UI can render a playback bubble."""
    def POST(self):
        _require_auth()
        web.header('Content-Type', 'application/json; charset=utf-8')
@@ -997,6 +1136,48 @@ class VoiceAsrHandler:
            return json.dumps({"status": "error", "message": str(e)})


+class VoiceTtsHandler:
+    """On-demand TTS for the in-chat "read aloud" button. Returns the
+    audio URL and (when session_id is given) persists it onto the message."""
+    def POST(self):
+        _require_auth()
+        web.header('Content-Type', 'application/json; charset=utf-8')
+        try:
+            data = json.loads(web.data() or b"{}")
+            text = (data.get("text") or "").strip()
+            session_id = (data.get("session_id") or "").strip()
+            if not text:
+                return json.dumps({"status": "error", "message": "empty text"})
+            # `@singleton` makes WebChannel a factory function — go via instance.
+            channel = WebChannel()
+            if not channel._tts_provider_ready():
+                return json.dumps({"status": "error", "message": "tts not configured"})
+
+            from bridge.bridge import Bridge
+            reply = Bridge().fetch_text_to_voice(text)
+            if reply is None or reply.type != ReplyType.VOICE or not reply.content:
+                msg = getattr(reply, "content", "") or "tts failed"
+                return json.dumps({"status": "error", "message": str(msg)})
+
+            url = channel._publish_tts_audio(reply.content)
+            if not url:
+                return json.dumps({"status": "error", "message": "publish failed"})
+
+            if session_id:
+                try:
+                    from agent.memory import get_conversation_store
+                    get_conversation_store().attach_extras_to_last_assistant(
+                        session_id, {"audio": {"url": url, "kind": "tts"}},
+                    )
+                except Exception as e:
+                    logger.debug(f"[VoiceTtsHandler] persist skipped: {e}")
+
+            return json.dumps({"status": "success", "audio_url": url})
+        except Exception as e:
+            logger.exception(f"[VoiceTtsHandler] failed: {e}")
+            return json.dumps({"status": "error", "message": str(e)})
+
+
 class UploadsHandler:
    def GET(self, file_name):
        _require_auth()
@@ -1357,10 +1538,243 @@ class ModelsHandler:
    POST /api/models/capability -> set provider/model for a capability
    """

-    # Capability -> editable flag, current-value resolver, and supported provider
-    # ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
+    # Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS.
    _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
-    _TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
+    # Web-console white-list. Other vendors stay usable via direct config.
+    _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"]
+
+    # TTS engine catalog (speech models, not voice timbres). Entries are
+    # either a bare code or {value, hint?} when a friendly label helps.
+    _TTS_PROVIDER_MODELS = {
+        "openai":    ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"],
+        "minimax": [
+            {"value": "speech-2.8-hd",    "hint": "情绪渲染融合语气词,自然听感"},
+            {"value": "speech-2.8-turbo", "hint": "极致生成速度,更自然逼真"},
+            {"value": "speech-2.6-hd",    "hint": "超低延时,归一化升级"},
+            {"value": "speech-2.6-turbo", "hint": "更快更便宜,适合语音聊天/数字人"},
+        ],
+        "dashscope": [
+            {"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"},
+        ],
+        # Aggregating gateway: a single endpoint multiplexes several
+        # underlying TTS engines, selected via the `model` field.
+        # Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES).
+        "linkai": [
+            {"value": "tts-1",  "hint": "OpenAI · 多语种通用"},
+            {"value": "doubao", "hint": "字节豆包 · 中文音色丰富"},
+            {"value": "baidu",  "hint": "百度 · 中文主播音色"},
+        ],
+    }
+
+    # Per-provider voice timbres. Entries can be a bare code string
+    # (label = code) or {value, hint?} when a friendly secondary label
+    # helps recognition. We keep `value` as the raw API code so power
+    # users can cross-reference config.json.
+    _TTS_PROVIDER_VOICES = {
+        "openai":    [
+            "alloy", "echo", "fable", "onyx", "nova", "shimmer",
+            "ash", "ballad", "coral", "sage", "verse",
+        ],
+        "minimax": [
+            # Mandarin Chinese (full catalog)
+            {"value": "male-qn-qingse",                           "hint": "中文 · 青涩青年（男）"},
+            {"value": "male-qn-jingying",                         "hint": "中文 · 精英青年（男）"},
+            {"value": "male-qn-badao",                            "hint": "中文 · 霸道青年（男）"},
+            {"value": "male-qn-daxuesheng",                       "hint": "中文 · 青年大学生（男）"},
+            {"value": "female-shaonv",                            "hint": "中文 · 少女（女）"},
+            {"value": "female-yujie",                             "hint": "中文 · 御姐（女）"},
+            {"value": "female-chengshu",                          "hint": "中文 · 成熟女性（女）"},
+            {"value": "female-tianmei",                           "hint": "中文 · 甜美女性（女）"},
+            {"value": "male-qn-qingse-jingpin",                   "hint": "中文 · 青涩青年-beta（男）"},
+            {"value": "male-qn-jingying-jingpin",                 "hint": "中文 · 精英青年-beta（男）"},
+            {"value": "male-qn-badao-jingpin",                    "hint": "中文 · 霸道青年-beta（男）"},
+            {"value": "male-qn-daxuesheng-jingpin",               "hint": "中文 · 青年大学生-beta（男）"},
+            {"value": "female-shaonv-jingpin",                    "hint": "中文 · 少女-beta（女）"},
+            {"value": "female-yujie-jingpin",                     "hint": "中文 · 御姐-beta（女）"},
+            {"value": "female-chengshu-jingpin",                  "hint": "中文 · 成熟女性-beta（女）"},
+            {"value": "female-tianmei-jingpin",                   "hint": "中文 · 甜美女性-beta（女）"},
+            {"value": "clever_boy",                               "hint": "中文 · 聪明男童"},
+            {"value": "cute_boy",                                 "hint": "中文 · 可爱男童"},
+            {"value": "lovely_girl",                              "hint": "中文 · 萌萌女童"},
+            {"value": "cartoon_pig",                              "hint": "中文 · 卡通猪小琪"},
+            {"value": "bingjiao_didi",                            "hint": "中文 · 病娇弟弟"},
+            {"value": "junlang_nanyou",                           "hint": "中文 · 俊朗男友"},
+            {"value": "chunzhen_xuedi",                           "hint": "中文 · 纯真学弟"},
+            {"value": "lengdan_xiongzhang",                       "hint": "中文 · 冷淡学长"},
+            {"value": "badao_shaoye",                             "hint": "中文 · 霸道少爷"},
+            {"value": "tianxin_xiaoling",                         "hint": "中文 · 甜心小玲"},
+            {"value": "qiaopi_mengmei",                           "hint": "中文 · 俏皮萌妹"},
+            {"value": "wumei_yujie",                              "hint": "中文 · 妩媚御姐"},
+            {"value": "diadia_xuemei",                            "hint": "中文 · 嗲嗲学妹"},
+            {"value": "danya_xuejie",                             "hint": "中文 · 淡雅学姐"},
+            {"value": "Chinese (Mandarin)_Reliable_Executive",    "hint": "中文 · 沉稳高管"},
+            {"value": "Chinese (Mandarin)_News_Anchor",           "hint": "中文 · 新闻女声"},
+            {"value": "Chinese (Mandarin)_Mature_Woman",          "hint": "中文 · 傲娇御姐"},
+            {"value": "Chinese (Mandarin)_Unrestrained_Young_Man","hint": "中文 · 不羁青年"},
+            {"value": "Arrogant_Miss",                            "hint": "中文 · 嚣张小姐"},
+            {"value": "Robot_Armor",                              "hint": "中文 · 机械战甲"},
+            {"value": "Chinese (Mandarin)_Kind-hearted_Antie",    "hint": "中文 · 热心大婶"},
+            {"value": "Chinese (Mandarin)_HK_Flight_Attendant",   "hint": "中文 · 港普空姐"},
+            {"value": "Chinese (Mandarin)_Humorous_Elder",        "hint": "中文 · 搞笑大爷"},
+            {"value": "Chinese (Mandarin)_Gentleman",             "hint": "中文 · 温润男声"},
+            {"value": "Chinese (Mandarin)_Warm_Bestie",           "hint": "中文 · 温暖闺蜜"},
+            {"value": "Chinese (Mandarin)_Male_Announcer",        "hint": "中文 · 播报男声"},
+            {"value": "Chinese (Mandarin)_Sweet_Lady",            "hint": "中文 · 甜美女声"},
+            {"value": "Chinese (Mandarin)_Southern_Young_Man",    "hint": "中文 · 南方小哥"},
+            {"value": "Chinese (Mandarin)_Wise_Women",            "hint": "中文 · 阅历姐姐"},
+            {"value": "Chinese (Mandarin)_Gentle_Youth",          "hint": "中文 · 温润青年"},
+            {"value": "Chinese (Mandarin)_Warm_Girl",             "hint": "中文 · 温暖少女"},
+            {"value": "Chinese (Mandarin)_Kind-hearted_Elder",    "hint": "中文 · 花甲奶奶"},
+            {"value": "Chinese (Mandarin)_Cute_Spirit",           "hint": "中文 · 憨憨萌兽"},
+            {"value": "Chinese (Mandarin)_Radio_Host",            "hint": "中文 · 电台男主播"},
+            {"value": "Chinese (Mandarin)_Lyrical_Voice",         "hint": "中文 · 抒情男声"},
+            {"value": "Chinese (Mandarin)_Straightforward_Boy",   "hint": "中文 · 率真弟弟"},
+            {"value": "Chinese (Mandarin)_Sincere_Adult",         "hint": "中文 · 真诚青年"},
+            {"value": "Chinese (Mandarin)_Gentle_Senior",         "hint": "中文 · 温柔学姐"},
+            {"value": "Chinese (Mandarin)_Stubborn_Friend",       "hint": "中文 · 嘴硬竹马"},
+            {"value": "Chinese (Mandarin)_Crisp_Girl",            "hint": "中文 · 清脆少女"},
+            {"value": "Chinese (Mandarin)_Pure-hearted_Boy",      "hint": "中文 · 清澈邻家弟弟"},
+            {"value": "Chinese (Mandarin)_Soft_Girl",             "hint": "中文 · 柔和少女"},
+            # Cantonese (full catalog)
+            {"value": "Cantonese_ProfessionalHost（F)",            "hint": "粤语 · 专业女主持"},
+            {"value": "Cantonese_GentleLady",                     "hint": "粤语 · 温柔女声"},
+            {"value": "Cantonese_ProfessionalHost（M)",            "hint": "粤语 · 专业男主持"},
+            {"value": "Cantonese_PlayfulMan",                     "hint": "粤语 · 活泼男声"},
+            {"value": "Cantonese_CuteGirl",                       "hint": "粤语 · 可爱女孩"},
+            {"value": "Cantonese_KindWoman",                      "hint": "粤语 · 善良女声"},
+            # English (curated: 1F + 1M)
+            {"value": "English_Graceful_Lady",                    "hint": "英文 · Graceful Lady（女）"},
+            {"value": "English_Trustworthy_Man",                  "hint": "英文 · Trustworthy Man（男）"},
+            # Japanese (curated: 1F + 1M)
+            {"value": "Japanese_KindLady",                        "hint": "日文 · Kind Lady（女）"},
+            {"value": "Japanese_LoyalKnight",                     "hint": "日文 · Loyal Knight（男）"},
+            # Korean (curated: 1F + 1M)
+            {"value": "Korean_SweetGirl",                         "hint": "韩文 · Sweet Girl（女）"},
+            {"value": "Korean_CheerfulBoyfriend",                 "hint": "韩文 · Cheerful Boyfriend（男）"},
+        ],
+        "dashscope": [
+            {"value": "Cherry",   "hint": "芊悦 · 阳光女声"},
+            {"value": "Serena",   "hint": "苏瑶 · 温柔女声"},
+            {"value": "Chelsie",  "hint": "千雪 · 二次元少女"},
+            {"value": "Ethan",    "hint": "晨煦 · 阳光男声"},
+            {"value": "Moon",     "hint": "月白 · 率性男声"},
+            {"value": "Kai",      "hint": "凯 · 治愈男声"},
+            {"value": "Nofish",   "hint": "不吃鱼 · 设计师男声"},
+            {"value": "Bella",    "hint": "萌宝 · 小萝莉"},
+            {"value": "Bunny",    "hint": "萌小姬 · 萌系少女"},
+            {"value": "Stella",   "hint": "少女阿月 · 元气少女"},
+            {"value": "Neil",     "hint": "阿闻 · 新闻主播"},
+            {"value": "Seren",    "hint": "小婉 · 助眠女声"},
+            {"value": "Jada",     "hint": "上海话 · 阿珍"},
+            {"value": "Dylan",    "hint": "北京话 · 晓东"},
+            {"value": "Sunny",    "hint": "四川话 · 晴儿"},
+            {"value": "Eric",     "hint": "四川话 · 程川"},
+            {"value": "Rocky",    "hint": "粤语 · 阿强"},
+            {"value": "Kiki",     "hint": "粤语 · 阿清"},
+            {"value": "Peter",    "hint": "天津话 · 李彼得"},
+            {"value": "Marcus",   "hint": "陕西话 · 秦川"},
+            {"value": "Roy",      "hint": "闽南语 · 阿杰"},
+        ],
+        # Aggregating gateway: voices are scoped per engine model. The
+        # frontend picks the correct list based on the selected model so
+        # users don't see incompatible timbres for the active engine.
+        "linkai": {
+            "tts-1": [
+                "alloy", "echo", "fable", "onyx", "nova", "shimmer",
+            ],
+            "doubao": [
+                {"value": "zh_female_wanwanxiaohe_moon_bigtts",       "hint": "湾湾小何"},
+                {"value": "BV007_streaming",                          "hint": "亲切女声"},
+                {"value": "BV001_streaming",                          "hint": "通用女声"},
+                {"value": "BV002_streaming",                          "hint": "通用男声"},
+                {"value": "BV051_streaming",                          "hint": "奶气萌娃"},
+                {"value": "zh_female_linjianvhai_moon_bigtts",        "hint": "邻家女孩"},
+                {"value": "BV700_streaming",                          "hint": "灿灿"},
+                {"value": "BV019_streaming",                          "hint": "重庆小伙"},
+                {"value": "BV524_streaming",                          "hint": "日语男声"},
+                {"value": "BV021_streaming",                          "hint": "东北老铁"},
+                {"value": "BV701_streaming",                          "hint": "擎苍"},
+                {"value": "BV113_streaming",                          "hint": "甜宠少御"},
+                {"value": "BV056_streaming",                          "hint": "阳光男声"},
+                {"value": "BV213_streaming",                          "hint": "广西表哥"},
+                {"value": "BV119_streaming",                          "hint": "通用赘婿"},
+                {"value": "BV705_streaming",                          "hint": "炀炀"},
+                {"value": "BV033_streaming",                          "hint": "温柔小哥"},
+                {"value": "BV102_streaming",                          "hint": "儒雅青年"},
+                {"value": "BV522_streaming",                          "hint": "气质女生"},
+                {"value": "BV034_streaming",                          "hint": "知性姐姐 · 双语"},
+                {"value": "BV005_streaming",                          "hint": "活泼女声"},
+                {"value": "zh_female_wanqudashu_moon_bigtts",         "hint": "湾区大叔"},
+                {"value": "zh_female_daimengchuanmei_moon_bigtts",    "hint": "呆萌川妹"},
+                {"value": "zh_male_guozhoudege_moon_bigtts",          "hint": "广州德哥"},
+                {"value": "zh_male_beijingxiaoye_moon_bigtts",        "hint": "北京小爷"},
+                {"value": "zh_male_shaonianzixin_moon_bigtts",        "hint": "少年梓辛 / Brayan"},
+                {"value": "zh_female_meilinvyou_moon_bigtts",         "hint": "魅力女友"},
+                {"value": "zh_male_shenyeboke_moon_bigtts",           "hint": "深夜播客"},
+                {"value": "zh_female_sajiaonvyou_moon_bigtts",        "hint": "柔美女友"},
+                {"value": "zh_female_yuanqinvyou_moon_bigtts",        "hint": "撒娇学妹"},
+                {"value": "zh_male_haoyuxiaoge_moon_bigtts",          "hint": "浩宇小哥"},
+                {"value": "zh_male_guangxiyuanzhou_moon_bigtts",      "hint": "广西远舟"},
+                {"value": "zh_female_meituojieer_moon_bigtts",        "hint": "妹坨洁儿"},
+                {"value": "zh_male_yuzhouzixuan_moon_bigtts",         "hint": "豫州子轩"},
+                {"value": "BV115_streaming",                          "hint": "古风少御"},
+                {"value": "zh_female_gaolengyujie_moon_bigtts",       "hint": "高冷御姐"},
+                {"value": "zh_male_yuanboxiaoshu_moon_bigtts",        "hint": "渊博小叔"},
+                {"value": "zh_male_yangguangqingnian_moon_bigtts",    "hint": "阳光青年"},
+                {"value": "zh_male_aojiaobazong_moon_bigtts",         "hint": "傲娇霸总"},
+                {"value": "zh_male_jingqiangkanye_moon_bigtts",       "hint": "京腔侃爷 / Harmony"},
+                {"value": "zh_female_shuangkuaisisi_moon_bigtts",     "hint": "爽快思思 / Skye"},
+                {"value": "zh_male_wennuanahu_moon_bigtts",           "hint": "温暖阿虎 / Alvin"},
+                {"value": "multi_female_shuangkuaisisi_moon_bigtts",  "hint": "はるこ / Esmeralda"},
+                {"value": "multi_male_jingqiangkanye_moon_bigtts",    "hint": "かずね / Javier or Álvaro"},
+                {"value": "multi_female_gaolengyujie_moon_bigtts",    "hint": "あけみ"},
+                {"value": "multi_male_wanqudashu_moon_bigtts",        "hint": "ひろし / Roberto"},
+                {"value": "ICL_zh_female_bingruoshaonv_tob",          "hint": "病弱少女"},
+                {"value": "ICL_zh_female_huoponvhai_tob",             "hint": "活泼女孩"},
+                {"value": "ICL_zh_female_heainainai_tob",             "hint": "和蔼奶奶"},
+                {"value": "ICL_zh_female_linjuayi_tob",               "hint": "邻居阿姨"},
+                {"value": "zh_female_wenrouxiaoya_moon_bigtts",       "hint": "温柔小雅"},
+                {"value": "zh_female_tianmeixiaoyuan_moon_bigtts",    "hint": "甜美小源"},
+                {"value": "zh_female_qingchezizi_moon_bigtts",        "hint": "清澈梓梓"},
+                {"value": "zh_male_dongfanghaoran_moon_bigtts",       "hint": "东方浩然"},
+                {"value": "zh_male_jieshuoxiaoming_moon_bigtts",      "hint": "解说小明"},
+                {"value": "zh_female_kailangjiejie_moon_bigtts",      "hint": "开朗姐姐"},
+                {"value": "zh_male_linjiananhai_moon_bigtts",         "hint": "邻家男孩"},
+                {"value": "zh_female_tianmeiyueyue_moon_bigtts",      "hint": "甜美悦悦"},
+                {"value": "zh_female_xinlingjitang_moon_bigtts",      "hint": "心灵鸡汤"},
+            ],
+            "baidu": [
+                {"value": "baidu_0",    "hint": "度小美 · 标准女主播"},
+                {"value": "baidu_1",    "hint": "度小宇 · 亲切男声"},
+                {"value": "baidu_3",    "hint": "度逍遥 · 情感男声"},
+                {"value": "baidu_4",    "hint": "度丫丫 · 童声"},
+                {"value": "baidu_5",    "hint": "度小娇 · 成熟女主播"},
+                {"value": "baidu_5003", "hint": "度逍遥 · 情感男声"},
+                {"value": "baidu_5118", "hint": "度小鹿 · 甜美女声"},
+                {"value": "baidu_103",  "hint": "度米朵 · 可爱童声"},
+                {"value": "baidu_106",  "hint": "度博文 · 专业男主播"},
+                {"value": "baidu_110",  "hint": "度小童 · 童声主播"},
+                {"value": "baidu_111",  "hint": "度小萌 · 软萌妹子"},
+                {"value": "baidu_4003", "hint": "度逍遥 · 情感男声"},
+                {"value": "baidu_4100", "hint": "度小雯 · 活力女主播"},
+                {"value": "baidu_4103", "hint": "度米朵 · 可爱女声"},
+                {"value": "baidu_4105", "hint": "度灵儿 · 清澈女声"},
+                {"value": "baidu_4106", "hint": "度博文 · 专业男主播"},
+                {"value": "baidu_4115", "hint": "度小贤 · 电台男主播"},
+                {"value": "baidu_4117", "hint": "度小乔 · 活泼女声"},
+                {"value": "baidu_4119", "hint": "度小鹿 · 甜美女声"},
+                {"value": "baidu_4129", "hint": "度小彦 · 知识男主播"},
+                {"value": "baidu_4140", "hint": "度小新 · 专业女主播"},
+                {"value": "baidu_4143", "hint": "度清风 · 配音男声"},
+                {"value": "baidu_4144", "hint": "度姗姗 · 娱乐女声"},
+                {"value": "baidu_4149", "hint": "度星河 · 广告男声"},
+                {"value": "baidu_4206", "hint": "度博文 · 综艺男声"},
+                {"value": "baidu_4226", "hint": "南方 · 电台女主播"},
+                {"value": "baidu_4254", "hint": "度小清 · 广告女声"},
+                {"value": "baidu_4278", "hint": "度小贝 · 知识女主播"},
+            ],
+        },
+    }
    _EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]

    # Capability-scoped model catalogs. The chat dropdown can reuse the
@@ -1525,7 +1939,7 @@ class ModelsHandler:
    @classmethod
    def _predict_vision_auto(cls, local_config: dict) -> dict:
        """Predict which provider vision.py will actually dispatch to when
-        no tool.vision.model is set. Mirrors the fallback order in
+        no tools.vision.model is set. Mirrors the fallback order in
        agent/tools/vision/vision.py::_resolve_providers so the UI hint
        matches reality."""
        chat = cls._chat_capability(local_config)
@@ -1590,12 +2004,12 @@ class ModelsHandler:

    @classmethod
    def _vision_capability(cls, local_config: dict) -> dict:
-        """Vision model. tool.vision.model is the explicit override; otherwise
+        """Vision model. tools.vision.model is the explicit override; otherwise
        the runtime fallback chain in agent/tools/vision/vision.py decides."""
-        tool_conf = local_config.get("tool") or {}
-        if not isinstance(tool_conf, dict):
-            tool_conf = {}
-        vision_conf = tool_conf.get("vision") or {}
+        tools_conf = local_config.get("tools") or local_config.get("tool") or {}
+        if not isinstance(tools_conf, dict):
+            tools_conf = {}
+        vision_conf = tools_conf.get("vision") or {}
        if not isinstance(vision_conf, dict):
            vision_conf = {}
        user_specified = (vision_conf.get("model") or "").strip()
@@ -1652,14 +2066,38 @@ class ModelsHandler:

    @classmethod
    def _tts_capability(cls, local_config: dict) -> dict:
-        provider_id = (local_config.get("text_to_voice") or "openai").strip().lower()
+        explicit = (local_config.get("text_to_voice") or "").strip().lower()
+        # Providers outside the white-list don't drive the picker, but their
+        # underlying runtime config is preserved so bridge still routes them.
+        ui_provider = explicit if explicit in cls._TTS_PROVIDERS else ""
+        suggested = ""
+        if not ui_provider:
+            for pid in cls._TTS_PROVIDERS:
+                meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
+                key_field = meta.get("api_key_field")
+                if key_field and cls._is_real_key(local_config.get(key_field, "")):
+                    suggested = pid
+                    break
        return {
            "editable": True,
-            "current_provider": provider_id,
-            "current_model": local_config.get("text_to_voice_model", "") or "",
+            "current_provider": ui_provider,
+            "suggested_provider": suggested,
+            "current_model": (local_config.get("text_to_voice_model") or "") if ui_provider else "",
+            "current_voice": (local_config.get("tts_voice_id") or "") if ui_provider else "",
            "providers": cls._TTS_PROVIDERS,
+            "provider_models": cls._TTS_PROVIDER_MODELS,
+            "provider_voices": cls._TTS_PROVIDER_VOICES,
+            "reply_mode": cls._tts_reply_mode(local_config),
        }

+    @staticmethod
+    def _tts_reply_mode(local_config: dict) -> str:
+        if local_config.get("always_reply_voice", False):
+            return "always"
+        if local_config.get("voice_reply_voice", False):
+            return "voice_if_voice"
+        return "off"
+
    @classmethod
    def _embedding_capability(cls, local_config: dict) -> dict:
        # Embedding is "pick or empty" — runtime's legacy openai/linkai
@@ -1728,17 +2166,20 @@ class ModelsHandler:

    @classmethod
    def _image_capability(cls, local_config: dict) -> dict:
-        """Image generation. Source of truth: config["skill"]["image-generation"]["model"]
+        """Image generation. Source of truth: config["skills"]["image-generation"]["model"]
        (mirrors the per-skill config schema documented in skills/image-generation).
        The runtime resolver in skills/image-generation/scripts/generate.py
        reads this via the SKILL_IMAGE_GENERATION_MODEL env var that the
        agent_initializer syncs at startup; provider is inferred from the
        model name prefix, mirroring vision.py's design.
+
+        ``skill`` (singular) is still tolerated as a legacy fallback —
+        config.load_config() folds it into ``skills`` at startup.
        """
-        skill_node = local_config.get("skill") or {}
-        if not isinstance(skill_node, dict):
-            skill_node = {}
-        img_node = skill_node.get("image-generation") or {}
+        skills_node = local_config.get("skills") or local_config.get("skill") or {}
+        if not isinstance(skills_node, dict):
+            skills_node = {}
+        img_node = skills_node.get("image-generation") or {}
        if not isinstance(img_node, dict):
            img_node = {}
        explicit_model = (img_node.get("model") or "").strip()
@@ -1832,6 +2273,8 @@ class ModelsHandler:
                return self._handle_delete_provider(data)
            if action == "set_capability":
                return self._handle_set_capability(data)
+            if action == "set_voice_reply_mode":
+                return self._handle_set_voice_reply_mode(data)
            return json.dumps({"status": "error", "message": f"unknown action: {action!r}"})
        except Exception as e:
            logger.error(f"[ModelsHandler] POST failed: {e}")
@@ -1918,7 +2361,7 @@ class ModelsHandler:
        if capability == "asr":
            return self._set_simple("voice_to_text", provider_id)
        if capability == "tts":
-            return self._set_tts(provider_id, model)
+            return self._set_tts(provider_id, model, (data.get("voice") or "").strip())
        if capability == "embedding":
            return self._set_embedding(provider_id, model)
        if capability == "image":
@@ -1926,35 +2369,20 @@ class ModelsHandler:
        return json.dumps({"status": "error", "message": f"capability not editable: {capability}"})

    def _set_image(self, provider_id: str, model: str) -> str:
-        # Source of truth: config["skill"]["image-generation"]["model"].
-        # provider_id is informational only (used by the UI to highlight a
-        # vendor card); the runtime resolver infers the provider from the
-        # model name prefix at request time, mirroring vision.py's design.
-        # An empty model means "switch back to auto / let the script pick".
+        # Source of truth: skills.image-generation.model. provider_id is
+        # informational only; the resolver picks the vendor by model prefix.
        local_config = conf()
        file_cfg = self._read_file_config()

-        def _ensure_skill_node(cfg: dict) -> dict:
-            skill_node = cfg.get("skill") or {}
-            if not isinstance(skill_node, dict):
-                skill_node = {}
-            img_node = skill_node.get("image-generation") or {}
-            if not isinstance(img_node, dict):
-                img_node = {}
-            skill_node["image-generation"] = img_node
-            cfg["skill"] = skill_node
-            return img_node
-
-        _ensure_skill_node(local_config)["model"] = model or ""
-        _ensure_skill_node(file_cfg)["model"] = model or ""
+        self._set_nested_namespace_value(local_config, "skills", "image-generation", "model", model or "")
+        self._set_nested_namespace_value(file_cfg, "skills", "image-generation", "model", model or "")
+        self._drop_legacy_namespace(local_config, "skill", "skills", child="image-generation")
+        self._drop_legacy_namespace(file_cfg, "skill", "skills", child="image-generation")

        self._write_file_config(file_cfg)

-        # The skill subprocess (skills/image-generation/scripts/generate.py)
-        # reads SKILL_IMAGE_GENERATION_MODEL from its environment, which is
-        # only synced from config["skill"] at startup. Update os.environ live
-        # so changes take effect on the next call without a restart. An empty
-        # model means "clear the override" → drop the env var entirely.
+        # The skill subprocess reads SKILL_IMAGE_GENERATION_MODEL from env at
+        # startup; mirror the change so live edits apply without restart.
        env_key = "SKILL_IMAGE_GENERATION_MODEL"
        if model:
            os.environ[env_key] = model
@@ -1992,8 +2420,6 @@ class ModelsHandler:
            applied["model"] = model

        if not applied:
-            # No-op save (nothing to write). Return success so the UI can
-            # confirm the click without showing a misleading error.
            return json.dumps({"status": "success", "applied": {}, "noop": True})

        self._write_file_config(file_cfg)
@@ -2002,34 +2428,66 @@ class ModelsHandler:
        return json.dumps({"status": "success", "applied": applied})

    def _set_vision(self, provider_id: str, model: str) -> str:
-        # Vision uses tool.vision.model (nested). provider_id is informational
-        # only; the runtime resolver auto-routes by model name prefix.
+        # Source of truth: tools.vision.model. provider_id is informational
+        # only; the resolver picks the vendor by model prefix.
        local_config = conf()
        file_cfg = self._read_file_config()
-        tool_node = file_cfg.get("tool") or {}
-        if not isinstance(tool_node, dict):
-            tool_node = {}
-        vision_node = tool_node.get("vision") or {}
-        if not isinstance(vision_node, dict):
-            vision_node = {}
-        vision_node["model"] = model
-        tool_node["vision"] = vision_node
-        file_cfg["tool"] = tool_node
-        # Mirror into in-memory config so the live agent sees the change.
-        runtime_tool = local_config.get("tool") or {}
-        if not isinstance(runtime_tool, dict):
-            runtime_tool = {}
-        runtime_vision = runtime_tool.get("vision") or {}
-        if not isinstance(runtime_vision, dict):
-            runtime_vision = {}
-        runtime_vision["model"] = model
-        runtime_tool["vision"] = runtime_vision
-        local_config["tool"] = runtime_tool
+        self._set_nested_namespace_value(file_cfg, "tools", "vision", "model", model)
+        self._set_nested_namespace_value(local_config, "tools", "vision", "model", model)
+        self._drop_legacy_namespace(file_cfg, "tool", "tools", child="vision")
+        self._drop_legacy_namespace(local_config, "tool", "tools", child="vision")

        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] vision model set: {model!r}")
        return json.dumps({"status": "success", "model": model})

+    @staticmethod
+    def _set_nested_namespace_value(cfg, top: str, name: str, key: str, value):
+        """Set ``cfg[top][name][key] = value``, creating missing dicts."""
+        bucket = cfg.get(top)
+        if not isinstance(bucket, dict):
+            bucket = {}
+        node = bucket.get(name)
+        if not isinstance(node, dict):
+            node = {}
+        node[key] = value
+        bucket[name] = node
+        cfg[top] = bucket
+
+    @staticmethod
+    def _drop_legacy_namespace(cfg, legacy: str, canonical: str, child: str) -> None:
+        """Strip the deprecated singular key so config.json stays single-source."""
+        legacy_section = cfg.get(legacy)
+        if not isinstance(legacy_section, dict):
+            return
+        legacy_section.pop(child, None)
+        if legacy_section:
+            cfg[legacy] = legacy_section
+        else:
+            cfg.pop(legacy, None)
+
+    def _handle_set_voice_reply_mode(self, data: dict) -> str:
+        # UI picker (off / voice_if_voice / always) maps to the legacy
+        # always_reply_voice + voice_reply_voice pair that chat_channel.py
+        # reads, so all channels (web/feishu/wecom/...) share the routing.
+        mode = (data.get("mode") or "").strip().lower()
+        if mode not in ("off", "voice_if_voice", "always"):
+            return json.dumps({"status": "error", "message": f"invalid mode: {mode!r}"})
+        always = (mode == "always")
+        if_voice = (mode == "voice_if_voice")
+        local_config = conf()
+        file_cfg = self._read_file_config()
+        local_config["always_reply_voice"] = always
+        local_config["voice_reply_voice"] = if_voice
+        file_cfg["always_reply_voice"] = always
+        file_cfg["voice_reply_voice"] = if_voice
+        self._write_file_config(file_cfg)
+        logger.info(
+            f"[ModelsHandler] voice reply mode set: {mode!r} "
+            f"(always_reply_voice={always}, voice_reply_voice={if_voice})"
+        )
+        return json.dumps({"status": "success", "mode": mode})
+
    def _set_simple(self, key: str, value: str) -> str:
        local_config = conf()
        file_cfg = self._read_file_config()
@@ -2037,25 +2495,30 @@ class ModelsHandler:
        file_cfg[key] = value
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] {key} set: {value!r}")
-        # Bridge caches voice_to_text routing + bot instance; refresh it
-        # so the change takes effect on the next voice request.
+        # Hot-swap the cached voice bot so the change takes effect immediately.
        if key in ("voice_to_text", "text_to_voice"):
            self._refresh_voice_routing()
        return json.dumps({"status": "success", key: value})

-    def _set_tts(self, provider_id: str, model: str) -> str:
+    def _set_tts(self, provider_id: str, model: str, voice: str = "") -> str:
        local_config = conf()
        file_cfg = self._read_file_config()
-        if provider_id:
-            local_config["text_to_voice"] = provider_id
-            file_cfg["text_to_voice"] = provider_id
-        if model:
-            local_config["text_to_voice_model"] = model
-            file_cfg["text_to_voice_model"] = model
+        local_config["text_to_voice"] = provider_id
+        file_cfg["text_to_voice"] = provider_id
+        local_config["text_to_voice_model"] = model
+        file_cfg["text_to_voice_model"] = model
+        local_config["tts_voice_id"] = voice
+        file_cfg["tts_voice_id"] = voice
        self._write_file_config(file_cfg)
-        logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
+        logger.info(
+            f"[ModelsHandler] tts updated: provider={provider_id!r} "
+            f"model={model!r} voice={voice!r}"
+        )
        self._refresh_voice_routing()
-        return json.dumps({"status": "success", "provider": provider_id, "model": model})
+        return json.dumps({
+            "status": "success",
+            "provider": provider_id, "model": model, "voice": voice,
+        })

    @staticmethod
    def _refresh_voice_routing() -> None:
@@ -2066,17 +2529,20 @@ class ModelsHandler:
            logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")

    def _set_embedding(self, provider_id: str, model: str) -> str:
-        # provider_id="" + model="" means "switch back to legacy auto mode".
+        # Two valid states: both empty (reset to pick-or-empty) OR both set.
+        # A provider without a model leaves the runtime in a broken half-state,
+        # so reject that explicitly instead of silently writing it through.
+        if provider_id and not model:
+            return json.dumps({
+                "status": "error",
+                "message": "embedding model is required when a provider is selected",
+            })
        local_config = conf()
        file_cfg = self._read_file_config()
        local_config["embedding_provider"] = provider_id
        file_cfg["embedding_provider"] = provider_id
-        if model:
-            local_config["embedding_model"] = model
-            file_cfg["embedding_model"] = model
-        else:
-            local_config["embedding_model"] = ""
-            file_cfg["embedding_model"] = ""
+        local_config["embedding_model"] = model
+        file_cfg["embedding_model"] = model
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
        # The next /memory rebuild-index command hot-swaps the provider onto