feat(voice): add dashscope & zhipu ASR, in-page mic input

2026-07-17 11:07:11 +08:00 · 2026-05-20 22:36:37 +08:00
parent fff7326209
commit 2b90f377e6
9 changed files with 786 additions and 34 deletions
--- a/channel/web/chat.html
+++ b/channel/web/chat.html
@@ -422,15 +422,24 @@
                                    </button>
                                </div>
                                <div id="slash-menu" class="slash-menu hidden"></div>
-                                <textarea id="chat-input"
-                                          class="flex-1 min-w-0 px-4 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
-                                                 bg-slate-50 dark:bg-white/5 text-slate-800 dark:text-slate-100
-                                                 placeholder:text-slate-400 dark:placeholder:text-slate-500
-                                                 focus:outline-none focus:ring-0 focus:border-primary-600
-                                                 text-sm leading-relaxed"
-                                          rows="1"
-                                          data-i18n-placeholder="input_placeholder"
-                                          placeholder="输入消息，或输入 / 使用指令"></textarea>
+                                <div class="flex-1 min-w-0 relative flex items-center">
+                                    <textarea id="chat-input"
+                                              class="w-full pl-4 pr-11 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
+                                                     bg-slate-50 dark:bg-white/5 text-slate-800 dark:text-slate-100
+                                                     placeholder:text-slate-400 dark:placeholder:text-slate-500
+                                                     focus:outline-none focus:ring-0 focus:border-primary-600
+                                                     text-sm leading-relaxed"
+                                              rows="1"
+                                              data-i18n-placeholder="input_placeholder"
+                                              placeholder="输入消息，或输入 / 使用指令"></textarea>
+                                    <button id="mic-btn" type="button"
+                                            class="absolute right-2 top-1/2 -translate-y-1/2 w-8 h-8 flex items-center justify-center rounded-lg
+                                                   text-slate-400 hover:text-primary-500 hover:bg-primary-50 dark:hover:bg-primary-900/20
+                                                   cursor-pointer transition-colors duration-150"
+                                            data-i18n-title="mic_idle_title" title="点击录音 / 再按一次结束">
+                                        <i class="fas fa-microphone text-sm"></i>
+                                    </button>
+                                </div>
                                <button id="send-btn"
                                        class="flex-shrink-0 w-10 h-10 flex items-center justify-center rounded-lg
                                               bg-primary-400 text-white hover:bg-primary-500
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -59,6 +59,7 @@ const I18N = {
        models_embedding_saved_title: '向量模型已更新',
        models_embedding_saved_msg: '请在聊天框输入 /memory rebuild-index 重建索引。',
        models_embedding_saved_ok: '去执行',
+        models_pick_provider: '待选择',
        models_clear_confirm_title: '清除厂商凭据',
        models_clear_confirm_msg: '确认清除该厂商的 API Key 与 Base URL 吗？相关能力将不再可用。',
        cancel: '取消',
@@ -153,6 +154,12 @@ const I18N = {
        tip_clear_context: '清除上下文',
        tip_attach: '添加附件',
        attach_menu_file: '上传文件',
+        mic_idle_title: '点击录音 / 再按一次结束',
+        mic_recording_title: '录音中，再次点击结束',
+        mic_busy_title: '识别中…',
+        mic_permission_denied: '无法访问麦克风，请检查浏览器权限',
+        mic_too_short: '录音太短，请重试',
+        mic_error: '语音识别失败',
        attach_menu_folder: '上传文件夹',
        confirm_yes: '确认',
        confirm_cancel: '取消',
@@ -207,6 +214,7 @@ const I18N = {
        models_embedding_saved_title: 'Embedding model updated',
        models_embedding_saved_msg: 'Send /memory rebuild-index in the chat to rebuild the index.',
        models_embedding_saved_ok: 'Go',
+        models_pick_provider: 'Pick a provider',
        models_clear_confirm_title: 'Clear vendor credentials',
        models_clear_confirm_msg: 'Remove this vendor\'s API Key and Base URL? Capabilities relying on it will stop working.',
        cancel: 'Cancel',
@@ -301,6 +309,12 @@ const I18N = {
        tip_clear_context: 'Clear Context',
        tip_attach: 'Add Attachment',
        attach_menu_file: 'Upload File',
+        mic_idle_title: 'Click to record, click again to stop',
+        mic_recording_title: 'Recording, click to stop',
+        mic_busy_title: 'Transcribing…',
+        mic_permission_denied: 'Cannot access microphone — check browser permissions',
+        mic_too_short: 'Recording too short, please retry',
+        mic_error: 'Speech recognition failed',
        attach_menu_folder: 'Upload Folder',
        confirm_yes: 'Confirm',
        confirm_cancel: 'Cancel',
@@ -707,6 +721,191 @@ if (!supportsDirectoryUpload && attachFolderOption) {
    attachFolderOption.classList.add('hidden');
 }

+// ---------------- Mic button: in-page voice input via the configured ASR provider ----------------
+(function setupMicButton() {
+    // Wires #mic-btn to a MediaRecorder capture flow: click to start, click
+    // again to stop, then POST the clip to /api/voice/asr and hand the
+    // transcript to sendVoiceMessage().
+    const micBtn = document.getElementById('mic-btn');
+    if (!micBtn) return;
+    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia ||
+        typeof window.MediaRecorder === 'undefined') {
+        // No capture support in this browser: hide the button entirely.
+        micBtn.style.display = 'none';
+        return;
+    }
+
+    let mediaRecorder = null;
+    let stream = null;
+    let chunks = [];
+    // True from the moment recording starts until the upload finishes, so
+    // clicks during transcription fall through to a no-op stop().
+    let recording = false;
+    // True while start() is awaiting the permission prompt / building the
+    // recorder. Guards against a second click opening a second stream.
+    let starting = false;
+    let recordStartedAt = 0;
+
+    const setIdle = () => {
+        recording = false;
+        micBtn.classList.remove('text-red-500', 'animate-pulse');
+        micBtn.classList.add('text-slate-400');
+        micBtn.querySelector('i').className = 'fas fa-microphone text-sm';
+        micBtn.title = t('mic_idle_title');
+    };
+    const setRecording = () => {
+        recording = true;
+        micBtn.classList.remove('text-slate-400');
+        micBtn.classList.add('text-red-500', 'animate-pulse');
+        micBtn.querySelector('i').className = 'fas fa-stop text-sm';
+        micBtn.title = t('mic_recording_title');
+    };
+    const setBusy = () => {
+        micBtn.classList.remove('text-red-500', 'animate-pulse', 'text-slate-400');
+        micBtn.classList.add('text-primary-500');
+        micBtn.querySelector('i').className = 'fas fa-spinner fa-spin text-sm';
+        micBtn.title = t('mic_busy_title');
+    };
+
+    // Returns the first container/codec the browser can record, or '' to
+    // let MediaRecorder pick its own default.
+    const pickMimeType = () => {
+        const candidates = [
+            'audio/webm;codecs=opus',
+            'audio/webm',
+            'audio/ogg;codecs=opus',
+            'audio/mp4',
+        ];
+        if (typeof window.MediaRecorder.isTypeSupported !== 'function') return '';
+        return candidates.find(m => MediaRecorder.isTypeSupported(m)) || '';
+    };
+
+    // Releases the microphone. `track` (not `t`) so the i18n helper t()
+    // is not shadowed inside the callback.
+    const stopStream = () => {
+        if (stream) {
+            stream.getTracks().forEach(track => track.stop());
+            stream = null;
+        }
+    };
+
+    let _micTipTimer = null;
+    let _micTipRemoveTimer = null;
+    const flashError = (msg) => {
+        console.warn('[mic]', msg);
+        // Pop a small bubble above the mic so the user actually notices it.
+        // The mic lives inside a relatively-positioned wrapper around the
+        // textarea (see chat.html), so we hang the tip off that wrapper.
+        const wrapper = micBtn.parentElement;
+        if (!wrapper) return;
+        let tip = wrapper.querySelector('.mic-tip');
+        if (!tip) {
+            tip = document.createElement('div');
+            tip.className = 'mic-tip absolute right-1 bottom-full mb-2 px-2 py-1 rounded-md '
+                + 'text-xs text-white bg-slate-800/90 dark:bg-slate-700/90 shadow-md '
+                + 'pointer-events-none whitespace-nowrap z-10';
+            wrapper.appendChild(tip);
+        }
+        tip.textContent = msg;
+        tip.style.opacity = '1';
+        // Cancel BOTH pending timers: if only the fade timer were cleared, a
+        // removal scheduled by the previous message could delete the tip we
+        // just reused for this one.
+        if (_micTipTimer) clearTimeout(_micTipTimer);
+        if (_micTipRemoveTimer) clearTimeout(_micTipRemoveTimer);
+        _micTipRemoveTimer = null;
+        _micTipTimer = setTimeout(() => {
+            tip.style.transition = 'opacity 200ms';
+            tip.style.opacity = '0';
+            _micTipRemoveTimer = setTimeout(() => tip.remove(), 250);
+        }, 2000);
+    };
+
+    // Sends the recorded clip to the ASR endpoint and, on success, forwards
+    // the transcript + replay URL to the chat. Always returns to idle.
+    const upload = async (blob, ext) => {
+        setBusy();
+        const fd = new FormData();
+        fd.append('file', blob, `recording.${ext}`);
+        try {
+            const resp = await fetch('/api/voice/asr', { method: 'POST', body: fd });
+            const data = await resp.json();
+            if (data.status === 'success' && data.text) {
+                // Voice-message UX: drop the recording into the conversation
+                // as a playable bubble with the caption underneath, then
+                // dispatch the recognised text through the regular send path.
+                sendVoiceMessage(data.text, data.audio_url);
+            } else {
+                flashError(data.message || t('mic_error'));
+            }
+        } catch (e) {
+            flashError(t('mic_error') + ': ' + e.message);
+        } finally {
+            setIdle();
+        }
+    };
+
+    const start = async () => {
+        // Ignore re-entry: a second click while the permission prompt is
+        // open would otherwise create a second stream/recorder and orphan
+        // the first one with the microphone still live.
+        if (starting || recording) return;
+        starting = true;
+        try {
+            try {
+                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+            } catch (e) {
+                flashError(t('mic_permission_denied'));
+                return;
+            }
+            chunks = [];
+            const mimeType = pickMimeType();
+            let recorder;
+            try {
+                recorder = mimeType
+                    ? new MediaRecorder(stream, { mimeType })
+                    : new MediaRecorder(stream);
+            } catch (e) {
+                stopStream();
+                flashError(t('mic_error') + ': ' + e.message);
+                return;
+            }
+            mediaRecorder = recorder;
+            recorder.ondataavailable = (ev) => {
+                if (ev.data && ev.data.size > 0) chunks.push(ev.data);
+            };
+            recorder.onstop = () => {
+                stopStream();
+                const recordedType = recorder.mimeType || 'audio/webm';
+                const blob = new Blob(chunks, { type: recordedType });
+                // Map mime -> extension so the server picks the right file suffix.
+                const mt = recordedType.split(';')[0];
+                const extMap = {
+                    'audio/webm': 'webm', 'audio/ogg': 'ogg',
+                    'audio/mp4': 'm4a',   'audio/mpeg': 'mp3',
+                };
+                const ext = extMap[mt] || 'webm';
+                // 256 bytes ~ container header only, no actual audio. Anything
+                // below that we treat as "tapped by mistake".
+                if (blob.size < 256) {
+                    setIdle();
+                    flashError(t('mic_too_short'));
+                    return;
+                }
+                upload(blob, ext);
+            };
+            try {
+                // timeslice=250ms: force the recorder to flush a chunk every 250ms.
+                // Without it some browsers wait for stop() before producing any data,
+                // which loses the audio on very short taps.
+                recorder.start(250);
+            } catch (e) {
+                // start() can throw; release the mic instead of leaving it open.
+                stopStream();
+                flashError(t('mic_error') + ': ' + e.message);
+                return;
+            }
+            recordStartedAt = Date.now();
+            setRecording();
+        } finally {
+            starting = false;
+        }
+    };
+
+    const stop = () => {
+        if (mediaRecorder && mediaRecorder.state !== 'inactive') {
+            mediaRecorder.stop();
+        }
+    };
+
+    const stopWithMinDuration = () => {
+        const elapsed = Date.now() - recordStartedAt;
+        const minMs = 350;
+        if (elapsed < minMs) {
+            // Give the recorder a moment to capture at least one chunk
+            // before we tell it to stop.
+            setTimeout(() => stop(), minMs - elapsed);
+        } else {
+            stop();
+        }
+    };
+
+    micBtn.addEventListener('click', () => {
+        if (recording) {
+            stopWithMinDuration();
+        } else {
+            start();
+        }
+    });
+
+    setIdle();
+})();
+
 // Smart auto-scroll: pause when user scrolls up, resume when near bottom
 let _autoScrollEnabled = true;
 const _SCROLL_THRESHOLD = 80; // px from bottom to re-enable auto-scroll
@@ -1250,6 +1449,87 @@ document.querySelectorAll('.example-card').forEach(card => {
    });
 });

+// Voice-message variant of sendMessage(): renders a playable audio bubble
+// with the ASR caption, then dispatches the recognised text to /message
+// through the same SSE/loading flow as a typed message.
+function sendVoiceMessage(text, audioUrl) {
+    // Posts an ASR transcript to /message exactly like a typed message, but
+    // shows it in the conversation as a voice bubble (audio + caption).
+    const transcript = (text || '').trim();
+    if (!transcript) return;
+
+    // Same input-history bookkeeping as a typed message.
+    inputHistory.push(transcript);
+    historyIdx = -1;
+    historySavedDraft = '';
+
+    // The welcome screen only exists before the first message is sent.
+    const welcome = document.getElementById('welcome-screen');
+    const titleInfo = welcome ? { sid: sessionId, userMsg: transcript } : null;
+    if (welcome) welcome.remove();
+
+    const sentAt = new Date();
+    addUserVoiceMessage(audioUrl, transcript, sentAt);
+    const loadingEl = addLoadingIndicator();
+
+    const payload = JSON.stringify({
+        session_id: sessionId,
+        message: transcript,
+        stream: true,
+        timestamp: sentAt.toISOString(),
+    });
+
+    const MAX_RETRIES = 2;
+    const RETRY_DELAY_MS = 1000;
+
+    const showSendFailure = () => {
+        loadingEl.remove();
+        addBotMessage(t('error_send'), new Date());
+    };
+
+    // One POST attempt; any failure inside it is retried with a linearly
+    // growing delay until MAX_RETRIES is exhausted.
+    const attemptPost = async (attempt) => {
+        try {
+            const resp = await fetch('/message', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: payload
+            });
+            const data = await resp.json();
+            if (data.status !== 'success') {
+                showSendFailure();
+                return;
+            }
+            if (data.stream) {
+                startSSE(data.request_id, loadingEl, sentAt, titleInfo);
+            } else {
+                loadingContainers[data.request_id] = loadingEl;
+            }
+        } catch (err) {
+            if (attempt < MAX_RETRIES) {
+                setTimeout(() => attemptPost(attempt + 1), RETRY_DELAY_MS * (attempt + 1));
+                return;
+            }
+            showSendFailure();
+        }
+    };
+    attemptPost(0);
+}
+
+function addUserVoiceMessage(audioUrl, caption, timestamp) {
+    // Appends a right-aligned user bubble containing a playable <audio>
+    // element (when audioUrl is given) with the ASR caption beneath it, then
+    // scrolls the chat to the bottom.
+    //   audioUrl  - URL of the saved recording; may be empty/undefined
+    //   caption   - recognised text, rendered HTML-escaped
+    //   timestamp - Date shown under the bubble via formatTime()
+    const el = document.createElement('div');
+    el.className = 'flex justify-end px-4 sm:px-6 py-3';
+    el.innerHTML = `
+        <div class="max-w-[75%] sm:max-w-[60%]">
+            <div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
+                <audio controls preload="metadata"
+                       class="block w-[260px] max-w-full h-9"></audio>
+                ${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
+            </div>
+            <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
+        </div>
+    `;
+    // Assign the URL through the DOM property instead of interpolating it
+    // into the markup: a value containing quotes can then never break out of
+    // the attribute, and a missing URL no longer yields src="undefined".
+    const player = el.querySelector('audio');
+    if (audioUrl) {
+        player.src = audioUrl;
+    } else {
+        player.remove();
+    }
+    messagesDiv.appendChild(el);
+    _autoScrollEnabled = true;
+    scrollChatToBottom(true);
+}
+
 function sendMessage() {
    const text = chatInput.value.trim();
    if (!text && pendingAttachments.length === 0) return;
@@ -2573,7 +2853,12 @@ let cfgProviderValue = '';
 let cfgModelValue = '';

 // --- Custom dropdown helper ---
-function initDropdown(el, options, selectedValue, onChange) {
+function initDropdown(el, options, selectedValue, onChange, opts) {
+    // opts.placeholder: when set AND selectedValue is empty, render that text
+    // in a dim style instead of auto-selecting options[0]. Useful for
+    // "pick or empty" capabilities (asr / embedding) where we want the
+    // user to make an explicit choice.
+    opts = opts || {};
    const textEl = el.querySelector('.cfg-dropdown-text');
    const menuEl = el.querySelector('.cfg-dropdown-menu');
    const selEl = el.querySelector('.cfg-dropdown-selected');
@@ -2615,8 +2900,20 @@ function initDropdown(el, options, selectedValue, onChange) {
            menuEl.appendChild(item);
        });
        const sel = options.find(o => o.value === el._ddValue);
-        textEl.textContent = sel ? sel.label : (options[0] ? options[0].label : '--');
-        if (!sel && options[0]) el._ddValue = options[0].value;
+        if (sel) {
+            textEl.textContent = sel.label;
+            textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
+        } else if (opts.placeholder && !el._ddValue) {
+            // No selection yet — show the placeholder in muted style.
+            // Do NOT write a fallback value, so the dropdown stays
+            // "unsaved" until the user explicitly picks.
+            textEl.textContent = opts.placeholder;
+            textEl.classList.add('text-slate-400', 'dark:text-slate-500');
+        } else {
+            textEl.textContent = options[0] ? options[0].label : '--';
+            textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
+            if (options[0]) el._ddValue = options[0].value;
+        }
    }

    render();
@@ -3566,21 +3863,27 @@ function renderCapabilityBody(def, cap, body) {
    // For auto-capable capabilities, an "auto" strategy means the user has
    // not pinned a vendor; we honor that by selecting the empty-string
    // sentinel rather than the resolved fallback provider name.
-    // `suggested_provider` is a UI-only preselect for embedding when nothing
-    // is pinned yet — purely cosmetic, not persisted until the user saves.
+    // `suggested_provider` is a UI-only preselect (used by embedding & ASR)
+    // when the user has not pinned a vendor yet — purely cosmetic, not
+    // persisted until the user clicks Save.
+    // For "pick or empty" capabilities (no current, no suggestion), we leave
+    // the dropdown unselected and show a muted placeholder so the user is
+    // nudged to pick explicitly.
+    const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
    const initialProviderValue = pendingProvider
        ? pendingProvider
        : ((cap.strategy === 'auto' && capabilitySupportsAuto(def.id))
            ? ''
            : (cap.current_provider
                || cap.suggested_provider
-                || (ddOpts[0] && ddOpts[0].value)
+                || (noSelectionAndNoHint ? '' : (ddOpts[0] && ddOpts[0].value))
                || ''));
    initDropdown(
        provDd,
        ddOpts,
        initialProviderValue,
-        (value) => onCapabilityProviderChange(def, value, body)
+        (value) => onCapabilityProviderChange(def, value, body),
+        noSelectionAndNoHint ? { placeholder: t('models_pick_provider') } : null
    );
    decorateCapabilityProviderDropdown(def, provDd, providerOpts);

--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -1,10 +1,11 @@
+import datetime
 import hashlib
 import hmac
-import time
 import json
 import logging
 import mimetypes
 import os
+import random
 import threading
 import time
 import uuid
@@ -340,6 +341,10 @@ class WebChannel(ChatChannel):
        # Use a single-element list as a mutable counter accessible from closure.
        reasoning_chars_sent = [0]
        reasoning_capped_notified = [False]
+        # Captures the first error message emitted by agent_stream so the
+        # subsequent agent_end handler can skip its "empty final_response"
+        # fallback (which would otherwise overwrite the real error).
+        streamed_error: List[str] = []

        def on_event(event: dict):
            if request_id not in self.sse_queues:
@@ -398,6 +403,25 @@ class WebChannel(ChatChannel):
                if tool_calls:
                    q.put({"type": "message_end", "has_tool_calls": True})

+            elif event_type == "error":
+                # Agent raised an exception (LLM 401/timeout/etc). Surface the
+                # real message instead of letting the empty-response fallback
+                # below hide it as "(模型未返回任何内容)".
+                err_msg = data.get("error") or "unknown error"
+                logger.warning(
+                    f"[WebChannel] agent_stream emitted error for "
+                    f"request {request_id}: {err_msg}"
+                )
+                # Remember it so the agent_end handler below knows not to
+                # rewrite the message into a generic empty-response notice.
+                streamed_error.append(err_msg)
+                q.put({
+                    "type": "done",
+                    "content": f"❌ {err_msg}",
+                    "request_id": request_id,
+                    "timestamp": time.time(),
+                })
+
            elif event_type == "agent_end":
                # Safety net: if the agent finishes with an empty final_response,
                # chat_channel skips _send_reply (because reply.content is empty),
@@ -406,16 +430,21 @@ class WebChannel(ChatChannel):
                # here so the frontend always gets closure.
                final_response = data.get("final_response", "")
                if not final_response or not str(final_response).strip():
-                    logger.warning(
-                        f"[WebChannel] agent_end with empty final_response for "
-                        f"request {request_id}, sending fallback done"
-                    )
-                    q.put({
-                        "type": "done",
-                        "content": "(模型未返回任何内容，请重试或换一种方式描述你的需求)",
-                        "request_id": request_id,
-                        "timestamp": time.time(),
-                    })
+                    if streamed_error:
+                        # Error was already surfaced via the `error` event
+                        # handler above; nothing more to do here.
+                        pass
+                    else:
+                        logger.warning(
+                            f"[WebChannel] agent_end with empty final_response for "
+                            f"request {request_id}, sending fallback done"
+                        )
+                        q.put({
+                            "type": "done",
+                            "content": "(模型未返回任何内容，请重试或换一种方式描述你的需求)",
+                            "request_id": request_id,
+                            "timestamp": time.time(),
+                        })

            elif event_type == "file_to_send":
                file_path = data.get("path", "")
@@ -432,6 +461,39 @@ class WebChannel(ChatChannel):

        return on_event

+    @staticmethod
+    def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
+        """Remove old ``voice_input_*`` files from the upload directory.
+
+        Mic recordings are written to the upload directory so the browser
+        can replay them in the conversation bubble, and nothing else prunes
+        them. This is a best-effort sweep: per-file OS errors are skipped
+        and any other failure is logged as a warning, never raised.
+
+        Args:
+            max_age_seconds: Files whose mtime is more than this many
+                seconds in the past are deleted.
+        """
+        try:
+            upload_dir = _get_upload_dir()
+            if not os.path.isdir(upload_dir):
+                return
+            started_at = time.time()
+            deleted_count = 0
+            recordings = [
+                entry for entry in os.listdir(upload_dir)
+                if entry.startswith("voice_input_")
+            ]
+            for entry in recordings:
+                path = os.path.join(upload_dir, entry)
+                try:
+                    is_stale = (
+                        os.path.isfile(path)
+                        and started_at - os.path.getmtime(path) > max_age_seconds
+                    )
+                    if is_stale:
+                        os.remove(path)
+                        deleted_count += 1
+                except OSError:
+                    # File vanished or is not accessible; move on.
+                    continue
+            if deleted_count:
+                logger.info(f"[WebChannel] cleaned up {deleted_count} stale voice recording(s) from {upload_dir}")
+        except Exception as exc:
+            logger.warning(f"[WebChannel] voice cleanup failed: {exc}")
+
    def upload_file(self):
        """Handle file or directory upload via multipart/form-data."""
        try:
@@ -703,6 +765,8 @@ class WebChannel(ChatChannel):
        port = conf().get("web_port", 9899)
        is_public_bind = host in ("0.0.0.0", "::")

+        self._cleanup_stale_voice_recordings()
+
        # 打印可用渠道类型提示
        logger.info(
            "[WebChannel] 全部可用通道如下，可修改 config.json 配置文件中的 channel_type 字段进行切换，多个通道用逗号分隔：")
@@ -746,6 +810,7 @@ class WebChannel(ChatChannel):
            '/upload', 'UploadHandler',
            '/uploads/(.*)', 'UploadsHandler',
            '/api/file', 'FileServeHandler',
+            '/api/voice/asr', 'VoiceAsrHandler',
            '/poll', 'PollHandler',
            '/stream', 'StreamHandler',
            '/chat', 'ChatHandler',
@@ -870,6 +935,68 @@ class UploadHandler:
        return WebChannel().upload_file()


+class VoiceAsrHandler:
+    """
+    Accept a short audio recording from the web console mic button,
+    save it under uploads/ so the browser can replay it, then run it
+    through the currently configured ASR provider.
+
+    Returns {status, text, audio_url} on success — the frontend renders
+    a voice-message bubble with the playable audio and the transcribed
+    caption. On failure the body is {status: "error", message}, plus
+    audio_url when the recording had already been saved.
+    """
+
+    # File suffixes accepted from the client; anything else is stored as .webm.
+    _ALLOWED_EXTS = (".webm", ".ogg", ".opus", ".mp4", ".m4a", ".mp3", ".wav")
+
+    def POST(self):
+        _require_auth()
+        web.header('Content-Type', 'application/json; charset=utf-8')
+
+        try:
+            params = _raw_web_input()
+            file_obj = params.get("file")
+            if file_obj is None:
+                return json.dumps({"status": "error", "message": "no audio file"})
+
+            filename = getattr(file_obj, "filename", "") or "recording.webm"
+            ext = os.path.splitext(filename)[1].lower() or ".webm"
+            if ext not in self._ALLOWED_EXTS:
+                ext = ".webm"
+
+            payload = file_obj.file.read() if hasattr(file_obj, "file") else file_obj.value
+            if not payload:
+                # Nothing to transcribe; don't leave a zero-byte file behind.
+                return json.dumps({"status": "error", "message": "empty audio file"})
+
+            upload_dir = _get_upload_dir()
+            os.makedirs(upload_dir, exist_ok=True)
+            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+            # A uuid fragment instead of a 0-9999 random int: two uploads in
+            # the same second must never resolve to the same path, or one
+            # recording would silently overwrite the other. The
+            # "voice_input_" prefix is what the startup cleanup matches on.
+            saved_name = f"voice_input_{ts}_{uuid.uuid4().hex[:8]}{ext}"
+            saved_path = os.path.join(upload_dir, saved_name)
+            with open(saved_path, "wb") as f:
+                f.write(payload)
+
+            audio_url = f"/uploads/{saved_name}"
+
+            from bridge.bridge import Bridge
+            reply = Bridge().fetch_voice_to_text(saved_path)
+            if reply is None:
+                return json.dumps({
+                    "status": "error",
+                    "message": "ASR returned no reply",
+                    "audio_url": audio_url,
+                })
+
+            from bridge.reply import ReplyType
+            if reply.type == ReplyType.TEXT:
+                return json.dumps({
+                    "status": "success",
+                    "text": reply.content or "",
+                    "audio_url": audio_url,
+                })
+            # Any non-text reply is treated as a provider-side failure whose
+            # content carries the error message.
+            return json.dumps({
+                "status": "error",
+                "message": reply.content or "ASR failed",
+                "audio_url": audio_url,
+            })
+        except Exception as e:
+            logger.exception(f"[VoiceAsrHandler] failed: {e}")
+            return json.dumps({"status": "error", "message": str(e)})
+
+
 class UploadsHandler:
    def GET(self, file_name):
        _require_auth()
@@ -1232,7 +1359,7 @@ class ModelsHandler:

    # Capability -> editable flag, current-value resolver, and supported provider
    # ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
-    _ASR_PROVIDERS = ["openai", "linkai", "baidu", "ali", "xunfei", "azure", "google"]
+    _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
    _TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
    _EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]

@@ -1502,10 +1629,23 @@ class ModelsHandler:

    @classmethod
    def _asr_capability(cls, local_config: dict) -> dict:
+        """Describe the speech-to-text capability for the models UI.
+
+        Reads ``voice_to_text`` from ``local_config`` (trimmed, lower-cased)
+        as the explicitly chosen provider. When it is unset, walks
+        ``_ASR_PROVIDERS`` in order and suggests the first provider whose
+        ``api_key_field`` in ``ConfigHandler.PROVIDER_MODELS`` passes
+        ``_is_real_key`` (presumably "a non-placeholder key is configured";
+        verify against that helper).
+
+        Returns a dict with ``editable``, ``current_provider``,
+        ``suggested_provider``, ``current_model`` (always empty here) and
+        ``providers``.
+        """
-        provider_id = (local_config.get("voice_to_text") or "openai").strip().lower()
+        # "Pick or empty" — when voice_to_text is unset we don't show a
+        # current selection. `suggested_provider` previews which vendor
+        # the bridge auto-picker would land on (purely a UX hint, NOT
+        # persisted). Once the user saves a vendor, we lock onto it.
+        explicit = (local_config.get("voice_to_text") or "").strip().lower()
+        suggested = ""
+        if not explicit:
+            for pid in cls._ASR_PROVIDERS:
+                meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
+                key_field = meta.get("api_key_field")
+                if key_field and cls._is_real_key(local_config.get(key_field, "")):
+                    suggested = pid
+                    break
        return {
            "editable": True,
-            "current_provider": provider_id,
+            "current_provider": explicit,
+            "suggested_provider": suggested,
            "current_model": "",
            "providers": cls._ASR_PROVIDERS,
        }
@@ -1897,6 +2037,10 @@ class ModelsHandler:
        file_cfg[key] = value
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] {key} set: {value!r}")
+        # Bridge caches voice_to_text routing + bot instance; refresh it
+        # so the change takes effect on the next voice request.
+        if key in ("voice_to_text", "text_to_voice"):
+            self._refresh_voice_routing()
        return json.dumps({"status": "success", key: value})

    def _set_tts(self, provider_id: str, model: str) -> str:
@@ -1910,8 +2054,17 @@ class ModelsHandler:
            file_cfg["text_to_voice_model"] = model
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
+        self._refresh_voice_routing()
        return json.dumps({"status": "success", "provider": provider_id, "model": model})

+    @staticmethod
+    def _refresh_voice_routing() -> None:
+        """Ask the Bridge to refresh its voice state after a config change.
+
+        Best-effort: any failure (including the import) is logged as a
+        warning and swallowed, so saving the setting still succeeds.
+        """
+        try:
+            from bridge.bridge import Bridge
+
+            bridge = Bridge()
+            bridge.refresh_voice()
+        except Exception as exc:
+            logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {exc}")
+
    def _set_embedding(self, provider_id: str, model: str) -> str:
        # provider_id="" + model="" means "switch back to legacy auto mode".
        local_config = conf()
@@ -1926,9 +2079,9 @@ class ModelsHandler:
            file_cfg["embedding_model"] = ""
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
-        # The agent's MemoryManager picks the new provider on next process
-        # restart; the index dim may now mismatch so a rebuild is needed.
-        # The frontend surfaces this via a confirm + post-save dialog.
+        # The next /memory rebuild-index command hot-swaps the provider onto
+        # the running MemoryManager (see plugins/cow_cli). The dim may have
+        # changed, so the frontend prompts the user to rebuild.
        return json.dumps({"status": "success", "provider": provider_id, "model": model})

    @staticmethod