feat(voice): add dashscope & zhipu ASR, in-page mic input

2026-07-20 05:27:59 +08:00 · 2026-05-20 22:36:37 +08:00
parent fff7326209
commit 2b90f377e6
9 changed files with 786 additions and 34 deletions
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -14,7 +14,9 @@ class Bridge(object):
    def __init__(self):
        self.btype = {
            "chat": const.OPENAI,
-            "voice_to_text": conf().get("voice_to_text", "openai"),
+            # Empty `voice_to_text` (the default in new configs) triggers
+            # the auto-pick below — see _auto_pick_voice_to_text for order.
+            "voice_to_text": conf().get("voice_to_text") or self._auto_pick_voice_to_text(),
            "text_to_voice": conf().get("text_to_voice", "google"),
            "translate": conf().get("translate", "baidu"),
        }
@@ -84,6 +86,46 @@ class Bridge(object):
        self.chat_bots = {}
        self._agent_bridge = None

+    def refresh_voice(self):
+        """Reload the voice providers from config and invalidate cached bots.
+
+        Recomputes the ``voice_to_text`` / ``text_to_voice`` routing entries
+        in ``self.btype`` and removes the matching entries from ``self.bots``
+        so the next lookup builds a bot for the new provider. Only those two
+        keys are touched; the agent bridge and chat bots are left alone.
+
+        Intended for the web console, after the user edits voice settings.
+        """
+        settings = conf()
+        configured_asr = settings.get("voice_to_text")
+        configured_tts = settings.get("text_to_voice")
+
+        # Empty / missing voice_to_text falls back to the key-based auto-pick;
+        # text_to_voice only defaults to "google" when the key is absent.
+        asr_provider = configured_asr or self._auto_pick_voice_to_text()
+        tts_provider = settings.get("text_to_voice", "google")
+
+        # With LinkAI enabled, unset or OpenAI-backed voice settings are
+        # routed through LinkAI instead.
+        if settings.get("use_linkai") and settings.get("linkai_api_key"):
+            if not configured_asr or configured_asr in ("openai",):
+                asr_provider = const.LINKAI
+            if not configured_tts or configured_tts in ("openai", const.TTS_1, const.TTS_1_HD):
+                tts_provider = const.LINKAI
+
+        self.btype.update(voice_to_text=asr_provider, text_to_voice=tts_provider)
+        for cached in ("voice_to_text", "text_to_voice"):
+            self.bots.pop(cached, None)
+        logger.info(f"[Bridge] voice refreshed: voice_to_text={asr_provider}, text_to_voice={tts_provider}")
+
+    @staticmethod
+    def _auto_pick_voice_to_text() -> str:
+        """Choose an ASR provider from whichever API key is configured.
+
+        Used when ``voice_to_text`` is unset. Providers are tried in the
+        order openai → dashscope → zhipu → linkai and the first one whose
+        key is non-blank (and not a template placeholder) wins. When no key
+        qualifies, 'openai' is returned so the original "missing key" error
+        is preserved.
+        """
+        placeholders = ("YOUR API KEY", "YOUR_API_KEY")
+        priority = (
+            ("open_ai_api_key", "openai"),
+            ("dashscope_api_key", "dashscope"),
+            ("zhipu_ai_api_key", "zhipu"),
+            ("linkai_api_key", "linkai"),
+        )
+        for key_field, provider in priority:
+            api_key = (conf().get(key_field) or "").strip()
+            if api_key != "" and api_key not in placeholders:
+                return provider
+        return "openai"
+
    # 模型对应的接口
    def get_bot(self, typename):
        if self.bots.get(typename) is None:
--- a/channel/web/chat.html
+++ b/channel/web/chat.html
@@ -422,8 +422,9 @@
                                    </button>
                                </div>
                                <div id="slash-menu" class="slash-menu hidden"></div>
+                                <div class="flex-1 min-w-0 relative flex items-center">
                                    <textarea id="chat-input"
-                                          class="flex-1 min-w-0 px-4 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
+                                              class="w-full pl-4 pr-11 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
                                                     bg-slate-50 dark:bg-white/5 text-slate-800 dark:text-slate-100
                                                     placeholder:text-slate-400 dark:placeholder:text-slate-500
                                                     focus:outline-none focus:ring-0 focus:border-primary-600
@@ -431,6 +432,14 @@
                                              rows="1"
                                              data-i18n-placeholder="input_placeholder"
                                              placeholder="输入消息，或输入 / 使用指令"></textarea>
+                                    <button id="mic-btn" type="button"
+                                            class="absolute right-2 top-1/2 -translate-y-1/2 w-8 h-8 flex items-center justify-center rounded-lg
+                                                   text-slate-400 hover:text-primary-500 hover:bg-primary-50 dark:hover:bg-primary-900/20
+                                                   cursor-pointer transition-colors duration-150"
+                                            data-i18n-title="mic_idle_title" title="点击录音 / 再按一次结束">
+                                        <i class="fas fa-microphone text-sm"></i>
+                                    </button>
+                                </div>
                                <button id="send-btn"
                                        class="flex-shrink-0 w-10 h-10 flex items-center justify-center rounded-lg
                                               bg-primary-400 text-white hover:bg-primary-500
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -59,6 +59,7 @@ const I18N = {
        models_embedding_saved_title: '向量模型已更新',
        models_embedding_saved_msg: '请在聊天框输入 /memory rebuild-index 重建索引。',
        models_embedding_saved_ok: '去执行',
+        models_pick_provider: '待选择',
        models_clear_confirm_title: '清除厂商凭据',
        models_clear_confirm_msg: '确认清除该厂商的 API Key 与 Base URL 吗？相关能力将不再可用。',
        cancel: '取消',
@@ -153,6 +154,12 @@ const I18N = {
        tip_clear_context: '清除上下文',
        tip_attach: '添加附件',
        attach_menu_file: '上传文件',
+        mic_idle_title: '点击录音 / 再按一次结束',
+        mic_recording_title: '录音中，再次点击结束',
+        mic_busy_title: '识别中…',
+        mic_permission_denied: '无法访问麦克风，请检查浏览器权限',
+        mic_too_short: '录音太短，请重试',
+        mic_error: '语音识别失败',
        attach_menu_folder: '上传文件夹',
        confirm_yes: '确认',
        confirm_cancel: '取消',
@@ -207,6 +214,7 @@ const I18N = {
        models_embedding_saved_title: 'Embedding model updated',
        models_embedding_saved_msg: 'Send /memory rebuild-index in the chat to rebuild the index.',
        models_embedding_saved_ok: 'Go',
+        models_pick_provider: 'Pick a provider',
        models_clear_confirm_title: 'Clear vendor credentials',
        models_clear_confirm_msg: 'Remove this vendor\'s API Key and Base URL? Capabilities relying on it will stop working.',
        cancel: 'Cancel',
@@ -301,6 +309,12 @@ const I18N = {
        tip_clear_context: 'Clear Context',
        tip_attach: 'Add Attachment',
        attach_menu_file: 'Upload File',
+        mic_idle_title: 'Click to record, click again to stop',
+        mic_recording_title: 'Recording, click to stop',
+        mic_busy_title: 'Transcribing…',
+        mic_permission_denied: 'Cannot access microphone — check browser permissions',
+        mic_too_short: 'Recording too short, please retry',
+        mic_error: 'Speech recognition failed',
        attach_menu_folder: 'Upload Folder',
        confirm_yes: 'Confirm',
        confirm_cancel: 'Cancel',
@@ -707,6 +721,191 @@ if (!supportsDirectoryUpload && attachFolderOption) {
    attachFolderOption.classList.add('hidden');
 }

+// ---------------- Mic button: in-page voice input via the configured ASR provider ----------------
+// Click once to record, click again to stop. The recording is POSTed to
+// /api/voice/asr and, on success, handed to sendVoiceMessage().
+(function setupMicButton() {
+    const micBtn = document.getElementById('mic-btn');
+    if (!micBtn) return;
+    // No recording support in this browser: hide the button instead of
+    // offering a control that can never work.
+    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia ||
+        typeof window.MediaRecorder === 'undefined') {
+        micBtn.style.display = 'none';
+        return;
+    }
+
+    let mediaRecorder = null;
+    let stream = null;
+    let chunks = [];
+    let recording = false;
+    // True while a click is still being processed (waiting for the mic
+    // permission, for the recorder to stop, or for the ASR upload). Clicks
+    // are ignored meanwhile so overlapping recordings cannot be started.
+    let busy = false;
+    let recordStartedAt = 0;
+
+    // Every colour/animation class any state may add. Each transition clears
+    // all of them first so no state leaks its styling into the next one.
+    const STATE_CLASSES = ['text-slate-400', 'text-red-500', 'text-primary-500', 'animate-pulse'];
+    const applyState = (classes, iconClass, titleKey) => {
+        micBtn.classList.remove(...STATE_CLASSES);
+        micBtn.classList.add(...classes);
+        micBtn.querySelector('i').className = iconClass;
+        micBtn.title = t(titleKey);
+    };
+
+    const setIdle = () => {
+        recording = false;
+        busy = false;
+        applyState(['text-slate-400'], 'fas fa-microphone text-sm', 'mic_idle_title');
+    };
+    const setRecording = () => {
+        recording = true;
+        busy = false;
+        applyState(['text-red-500', 'animate-pulse'], 'fas fa-stop text-sm', 'mic_recording_title');
+    };
+    const setBusy = () => {
+        busy = true;
+        applyState(['text-primary-500'], 'fas fa-spinner fa-spin text-sm', 'mic_busy_title');
+    };
+
+    // First container/codec in this list that the browser can record, or ''
+    // to let MediaRecorder choose its own default.
+    const pickMimeType = () => {
+        const candidates = [
+            'audio/webm;codecs=opus',
+            'audio/webm',
+            'audio/ogg;codecs=opus',
+            'audio/mp4',
+        ];
+        for (const m of candidates) {
+            if (window.MediaRecorder.isTypeSupported && MediaRecorder.isTypeSupported(m)) {
+                return m;
+            }
+        }
+        return '';
+    };
+
+    // Release the microphone tracks, if a stream is currently held.
+    const stopStream = () => {
+        if (stream) {
+            stream.getTracks().forEach(track => track.stop());
+            stream = null;
+        }
+    };
+
+    let micTipFadeTimer = null;
+    let micTipRemoveTimer = null;
+    const flashError = (msg) => {
+        console.warn('[mic]', msg);
+        // Pop a small bubble above the mic so the user actually notices it.
+        // The mic lives inside a relatively-positioned wrapper around the
+        // textarea (see chat.html), so we hang the tip off that wrapper.
+        const wrapper = micBtn.parentElement;
+        if (!wrapper) return;
+        // Cancel both the pending fade and the pending removal of an earlier
+        // tip; otherwise a tip re-shown during its fade-out would be removed
+        // by the stale timer right after the new text is set.
+        clearTimeout(micTipFadeTimer);
+        clearTimeout(micTipRemoveTimer);
+        let tip = wrapper.querySelector('.mic-tip');
+        if (!tip) {
+            tip = document.createElement('div');
+            tip.className = 'mic-tip absolute right-1 bottom-full mb-2 px-2 py-1 rounded-md '
+                + 'text-xs text-white bg-slate-800/90 dark:bg-slate-700/90 shadow-md '
+                + 'pointer-events-none whitespace-nowrap z-10';
+            wrapper.appendChild(tip);
+        }
+        tip.textContent = msg;
+        tip.style.transition = 'opacity 200ms';
+        tip.style.opacity = '1';
+        micTipFadeTimer = setTimeout(() => {
+            tip.style.opacity = '0';
+            micTipRemoveTimer = setTimeout(() => tip.remove(), 250);
+        }, 2000);
+    };
+
+    // Send the recording to the ASR endpoint; always ends back in idle.
+    const upload = async (blob, ext) => {
+        setBusy();
+        const fd = new FormData();
+        fd.append('file', blob, `recording.${ext}`);
+        try {
+            const resp = await fetch('/api/voice/asr', { method: 'POST', body: fd });
+            const data = await resp.json();
+            if (data.status === 'success' && data.text) {
+                // Voice-message UX: drop the recording into the conversation
+                // as a playable bubble with the caption underneath, then
+                // dispatch the recognised text through the regular send path.
+                sendVoiceMessage(data.text, data.audio_url);
+            } else {
+                flashError(data.message || t('mic_error'));
+            }
+        } catch (e) {
+            flashError(t('mic_error') + ': ' + e.message);
+        } finally {
+            setIdle();
+        }
+    };
+
+    // Acquire the microphone and begin recording. Every failure path
+    // releases the stream and returns to idle so the button stays usable.
+    const start = async () => {
+        try {
+            stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+        } catch (e) {
+            setIdle();
+            flashError(t('mic_permission_denied'));
+            return;
+        }
+        chunks = [];
+        const mimeType = pickMimeType();
+        let recorder;
+        try {
+            recorder = mimeType
+                ? new MediaRecorder(stream, { mimeType })
+                : new MediaRecorder(stream);
+        } catch (e) {
+            stopStream();
+            setIdle();
+            flashError(t('mic_error') + ': ' + e.message);
+            return;
+        }
+        recorder.ondataavailable = (ev) => {
+            if (ev.data && ev.data.size > 0) chunks.push(ev.data);
+        };
+        recorder.onstop = () => {
+            stopStream();
+            const recordedType = recorder.mimeType || 'audio/webm';
+            const blob = new Blob(chunks, { type: recordedType });
+            // Map mime -> extension so the server picks the right file suffix.
+            const mt = recordedType.split(';')[0];
+            const extMap = {
+                'audio/webm': 'webm', 'audio/ogg': 'ogg',
+                'audio/mp4': 'm4a',   'audio/mpeg': 'mp3',
+            };
+            const ext = extMap[mt] || 'webm';
+            // 256 bytes ~ container header only, no actual audio. Anything
+            // below that we treat as "tapped by mistake".
+            if (blob.size < 256) {
+                setIdle();
+                flashError(t('mic_too_short'));
+                return;
+            }
+            upload(blob, ext);
+        };
+        mediaRecorder = recorder;
+        try {
+            // timeslice=250ms: force the recorder to flush a chunk every 250ms.
+            // Without it some browsers wait for stop() before producing any data,
+            // which loses the audio on very short taps.
+            recorder.start(250);
+        } catch (e) {
+            stopStream();
+            setIdle();
+            flashError(t('mic_error') + ': ' + e.message);
+            return;
+        }
+        recordStartedAt = Date.now();
+        setRecording();
+    };
+
+    const stop = () => {
+        if (mediaRecorder && mediaRecorder.state !== 'inactive') {
+            mediaRecorder.stop();
+        }
+    };
+
+    const stopWithMinDuration = () => {
+        const elapsed = Date.now() - recordStartedAt;
+        const minMs = 350;
+        if (elapsed < minMs) {
+            // Give the recorder a moment to capture at least one chunk
+            // before we tell it to stop.
+            setTimeout(() => stop(), minMs - elapsed);
+        } else {
+            stop();
+        }
+    };
+
+    micBtn.addEventListener('click', () => {
+        // Ignore clicks while the previous one is still being processed.
+        if (busy) return;
+        busy = true;
+        if (recording) {
+            stopWithMinDuration();
+        } else {
+            start();
+        }
+    });
+
+    setIdle();
+})();
+
 // Smart auto-scroll: pause when user scrolls up, resume when near bottom
 let _autoScrollEnabled = true;
 const _SCROLL_THRESHOLD = 80; // px from bottom to re-enable auto-scroll
@@ -1250,6 +1449,87 @@ document.querySelectorAll('.example-card').forEach(card => {
    });
 });

+// Voice-message counterpart of sendMessage(): shows the recording as a
+// playable bubble captioned with the ASR transcript, then POSTs the
+// transcript to /message and follows the same SSE/loading flow as typed text.
+// Does nothing when the transcript is blank.
+function sendVoiceMessage(text, audioUrl) {
+    const transcript = (text || '').trim();
+    if (!transcript) return;
+
+    // Record the transcript in the input history and reset history browsing.
+    inputHistory.push(transcript);
+    historyIdx = -1;
+    historySavedDraft = '';
+
+    // The welcome screen is only present before the first message.
+    const welcome = document.getElementById('welcome-screen');
+    const isFirstMessage = Boolean(welcome);
+    if (welcome) welcome.remove();
+
+    const titleInfo = isFirstMessage ? { sid: sessionId, userMsg: transcript } : null;
+    const sentAt = new Date();
+    addUserVoiceMessage(audioUrl, transcript, sentAt);
+    const loadingEl = addLoadingIndicator();
+
+    const payload = JSON.stringify({
+        session_id: sessionId,
+        message: transcript,
+        stream: true,
+        timestamp: sentAt.toISOString(),
+    });
+
+    const MAX_RETRIES = 2;
+    const RETRY_DELAY_MS = 1000;
+
+    // Replace the loading indicator with a generic send-error bot message.
+    const reportFailure = () => {
+        loadingEl.remove();
+        addBotMessage(t('error_send'), new Date());
+    };
+
+    const handleResponse = (data) => {
+        if (data.status !== 'success') {
+            reportFailure();
+            return;
+        }
+        if (data.stream) {
+            startSSE(data.request_id, loadingEl, sentAt, titleInfo);
+        } else {
+            loadingContainers[data.request_id] = loadingEl;
+        }
+    };
+
+    // Any rejection in the chain (fetch, JSON parsing, or the response
+    // handler) is retried up to MAX_RETRIES times, waiting one more
+    // RETRY_DELAY_MS on each successive attempt.
+    const attemptPost = (attempt) => {
+        fetch('/message', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: payload
+        })
+        .then(resp => resp.json())
+        .then(handleResponse)
+        .catch(() => {
+            if (attempt >= MAX_RETRIES) {
+                reportFailure();
+                return;
+            }
+            setTimeout(() => attemptPost(attempt + 1), RETRY_DELAY_MS * (attempt + 1));
+        });
+    };
+    attemptPost(0);
+}
+
+// Append a right-aligned voice-message bubble to the conversation: a playable
+// <audio> element on top, the ASR caption (if any) beneath, and the formatted
+// timestamp below the bubble. Scrolls the chat to the bottom afterwards.
+function addUserVoiceMessage(audioUrl, caption, timestamp) {
+    const el = document.createElement('div');
+    el.className = 'flex justify-end px-4 sm:px-6 py-3';
+    // The audio source is deliberately NOT interpolated into the markup: it
+    // is assigned through the DOM below, so the URL can never break out of
+    // the attribute and inject HTML.
+    el.innerHTML = `
+        <div class="max-w-[75%] sm:max-w-[60%]">
+            <div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
+                <audio controls preload="metadata"
+                       class="block w-[260px] max-w-full h-9"></audio>
+                ${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
+            </div>
+            <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
+        </div>
+    `;
+    const audioEl = el.querySelector('audio');
+    if (audioUrl) {
+        audioEl.src = audioUrl;
+    } else {
+        // No recording URL available: show the caption only rather than a
+        // broken player pointing at "undefined".
+        audioEl.remove();
+    }
+    messagesDiv.appendChild(el);
+    _autoScrollEnabled = true;
+    scrollChatToBottom(true);
+}
+
 function sendMessage() {
    const text = chatInput.value.trim();
    if (!text && pendingAttachments.length === 0) return;
@@ -2573,7 +2853,12 @@ let cfgProviderValue = '';
 let cfgModelValue = '';

 // --- Custom dropdown helper ---
-function initDropdown(el, options, selectedValue, onChange) {
+function initDropdown(el, options, selectedValue, onChange, opts) {
+    // opts.placeholder: when set AND selectedValue is empty, render that text
+    // in a dim style instead of auto-selecting options[0]. Useful for
+    // "pick or empty" capabilities (asr / embedding) where we want the
+    // user to make an explicit choice.
+    opts = opts || {};
    const textEl = el.querySelector('.cfg-dropdown-text');
    const menuEl = el.querySelector('.cfg-dropdown-menu');
    const selEl = el.querySelector('.cfg-dropdown-selected');
@@ -2615,8 +2900,20 @@ function initDropdown(el, options, selectedValue, onChange) {
            menuEl.appendChild(item);
        });
        const sel = options.find(o => o.value === el._ddValue);
-        textEl.textContent = sel ? sel.label : (options[0] ? options[0].label : '--');
-        if (!sel && options[0]) el._ddValue = options[0].value;
+        if (sel) {
+            textEl.textContent = sel.label;
+            textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
+        } else if (opts.placeholder && !el._ddValue) {
+            // No selection yet — show the placeholder in muted style.
+            // Do NOT write a fallback value, so the dropdown stays
+            // "unsaved" until the user explicitly picks.
+            textEl.textContent = opts.placeholder;
+            textEl.classList.add('text-slate-400', 'dark:text-slate-500');
+        } else {
+            textEl.textContent = options[0] ? options[0].label : '--';
+            textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
+            if (options[0]) el._ddValue = options[0].value;
+        }
    }

    render();
@@ -3566,21 +3863,27 @@ function renderCapabilityBody(def, cap, body) {
    // For auto-capable capabilities, an "auto" strategy means the user has
    // not pinned a vendor; we honor that by selecting the empty-string
    // sentinel rather than the resolved fallback provider name.
-    // `suggested_provider` is a UI-only preselect for embedding when nothing
-    // is pinned yet — purely cosmetic, not persisted until the user saves.
+    // `suggested_provider` is a UI-only preselect (used by embedding & ASR)
+    // when the user has not pinned a vendor yet — purely cosmetic, not
+    // persisted until the user clicks Save.
+    // For "pick or empty" capabilities (no current, no suggestion), we leave
+    // the dropdown unselected and show a muted placeholder so the user is
+    // nudged to pick explicitly.
+    const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
    const initialProviderValue = pendingProvider
        ? pendingProvider
        : ((cap.strategy === 'auto' && capabilitySupportsAuto(def.id))
            ? ''
            : (cap.current_provider
                || cap.suggested_provider
-                || (ddOpts[0] && ddOpts[0].value)
+                || (noSelectionAndNoHint ? '' : (ddOpts[0] && ddOpts[0].value))
                || ''));
    initDropdown(
        provDd,
        ddOpts,
        initialProviderValue,
-        (value) => onCapabilityProviderChange(def, value, body)
+        (value) => onCapabilityProviderChange(def, value, body),
+        noSelectionAndNoHint ? { placeholder: t('models_pick_provider') } : null
    );
    decorateCapabilityProviderDropdown(def, provDd, providerOpts);

--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -1,10 +1,11 @@
+import datetime
 import hashlib
 import hmac
-import time
 import json
 import logging
 import mimetypes
 import os
+import random
 import threading
 import time
 import uuid
@@ -340,6 +341,10 @@ class WebChannel(ChatChannel):
        # Use a single-element list as a mutable counter accessible from closure.
        reasoning_chars_sent = [0]
        reasoning_capped_notified = [False]
+        # Captures the first error message emitted by agent_stream so the
+        # subsequent agent_end handler can skip its "empty final_response"
+        # fallback (which would otherwise overwrite the real error).
+        streamed_error: List[str] = []

        def on_event(event: dict):
            if request_id not in self.sse_queues:
@@ -398,6 +403,25 @@ class WebChannel(ChatChannel):
                if tool_calls:
                    q.put({"type": "message_end", "has_tool_calls": True})

+            elif event_type == "error":
+                # Agent raised an exception (LLM 401/timeout/etc). Surface the
+                # real message instead of letting the empty-response fallback
+                # below hide it as "(模型未返回任何内容)".
+                err_msg = data.get("error") or "unknown error"
+                logger.warning(
+                    f"[WebChannel] agent_stream emitted error for "
+                    f"request {request_id}: {err_msg}"
+                )
+                # Remember it so the agent_end handler below knows not to
+                # rewrite the message into a generic empty-response notice.
+                streamed_error.append(err_msg)
+                q.put({
+                    "type": "done",
+                    "content": f"❌ {err_msg}",
+                    "request_id": request_id,
+                    "timestamp": time.time(),
+                })
+
            elif event_type == "agent_end":
                # Safety net: if the agent finishes with an empty final_response,
                # chat_channel skips _send_reply (because reply.content is empty),
@@ -406,6 +430,11 @@ class WebChannel(ChatChannel):
                # here so the frontend always gets closure.
                final_response = data.get("final_response", "")
                if not final_response or not str(final_response).strip():
+                    if streamed_error:
+                        # Error was already surfaced via the `error` event
+                        # handler above; nothing more to do here.
+                        pass
+                    else:
                        logger.warning(
                            f"[WebChannel] agent_end with empty final_response for "
                            f"request {request_id}, sending fallback done"
@@ -432,6 +461,39 @@ class WebChannel(ChatChannel):

        return on_event

+    @staticmethod
+    def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
+        """Remove old web-mic recordings from the upload directory.
+
+        Only regular files whose name starts with ``voice_input_`` and whose
+        modification time is more than `max_age_seconds` in the past are
+        deleted. The recordings are kept in the upload directory so the
+        browser can replay them in the conversation bubble, and nothing else
+        prunes them, so they would otherwise pile up.
+
+        Best-effort: a file that cannot be inspected or removed is skipped,
+        and any other failure is logged as a warning instead of raised.
+        """
+        try:
+            upload_dir = _get_upload_dir()
+            if not os.path.isdir(upload_dir):
+                return
+            now = time.time()
+            candidates = [
+                os.path.join(upload_dir, name)
+                for name in os.listdir(upload_dir)
+                if name.startswith("voice_input_")
+            ]
+            removed = 0
+            for path in candidates:
+                try:
+                    if os.path.isfile(path) and now - os.path.getmtime(path) > max_age_seconds:
+                        os.remove(path)
+                        removed += 1
+                except OSError:
+                    # File vanished or is not accessible; move on to the next.
+                    pass
+            if removed:
+                logger.info(f"[WebChannel] cleaned up {removed} stale voice recording(s) from {upload_dir}")
+        except Exception as e:
+            logger.warning(f"[WebChannel] voice cleanup failed: {e}")
+
    def upload_file(self):
        """Handle file or directory upload via multipart/form-data."""
        try:
@@ -703,6 +765,8 @@ class WebChannel(ChatChannel):
        port = conf().get("web_port", 9899)
        is_public_bind = host in ("0.0.0.0", "::")

+        self._cleanup_stale_voice_recordings()
+
        # 打印可用渠道类型提示
        logger.info(
            "[WebChannel] 全部可用通道如下，可修改 config.json 配置文件中的 channel_type 字段进行切换，多个通道用逗号分隔：")
@@ -746,6 +810,7 @@ class WebChannel(ChatChannel):
            '/upload', 'UploadHandler',
            '/uploads/(.*)', 'UploadsHandler',
            '/api/file', 'FileServeHandler',
+            '/api/voice/asr', 'VoiceAsrHandler',
            '/poll', 'PollHandler',
            '/stream', 'StreamHandler',
            '/chat', 'ChatHandler',
@@ -870,6 +935,68 @@ class UploadHandler:
        return WebChannel().upload_file()


+class VoiceAsrHandler:
+    """
+    Accept a short audio recording from the web console mic button,
+    save it under uploads/ so the browser can replay it, then run it
+    through the currently configured ASR provider.
+
+    Returns {status, text, audio_url} on success — the frontend renders
+    a voice-message bubble with the playable audio and the transcribed
+    caption. On failure the response is {status: "error", message} and,
+    when the recording was already saved, also carries audio_url.
+    """
+    # File suffixes stored as-is; anything else is saved as .webm.
+    _ALLOWED_EXTS = (".webm", ".ogg", ".opus", ".mp4", ".m4a", ".mp3", ".wav")
+
+    def POST(self):
+        _require_auth()
+        web.header('Content-Type', 'application/json; charset=utf-8')
+
+        try:
+            params = _raw_web_input()
+            file_obj = params.get("file")
+            if file_obj is None:
+                return json.dumps({"status": "error", "message": "no audio file"})
+
+            filename = getattr(file_obj, "filename", "") or "recording.webm"
+            ext = os.path.splitext(filename)[1].lower() or ".webm"
+            if ext not in self._ALLOWED_EXTS:
+                ext = ".webm"
+
+            # Read the payload before touching the disk so an empty upload
+            # neither leaves a zero-byte file behind nor costs an ASR call.
+            audio_bytes = file_obj.file.read() if hasattr(file_obj, "file") else file_obj.value
+            if not audio_bytes:
+                return json.dumps({"status": "error", "message": "empty audio file"})
+
+            upload_dir = _get_upload_dir()
+            os.makedirs(upload_dir, exist_ok=True)
+            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+            # A uuid-based suffix keeps two recordings saved within the same
+            # second from overwriting each other. The "voice_input_" prefix
+            # is what the startup cleanup matches on.
+            saved_name = f"voice_input_{ts}_{uuid.uuid4().hex[:8]}{ext}"
+            saved_path = os.path.join(upload_dir, saved_name)
+            with open(saved_path, "wb") as f:
+                f.write(audio_bytes)
+
+            audio_url = f"/uploads/{saved_name}"
+
+            from bridge.bridge import Bridge
+            reply = Bridge().fetch_voice_to_text(saved_path)
+            if reply is None:
+                return json.dumps({
+                    "status": "error",
+                    "message": "ASR returned no reply",
+                    "audio_url": audio_url,
+                })
+
+            from bridge.reply import ReplyType
+            if reply.type == ReplyType.TEXT:
+                return json.dumps({
+                    "status": "success",
+                    "text": reply.content or "",
+                    "audio_url": audio_url,
+                })
+            # Any non-text reply is treated as a failure; its content, when
+            # present, is passed through as the error message.
+            return json.dumps({
+                "status": "error",
+                "message": reply.content or "ASR failed",
+                "audio_url": audio_url,
+            })
+        except Exception as e:
+            logger.exception(f"[VoiceAsrHandler] failed: {e}")
+            return json.dumps({"status": "error", "message": str(e)})
+
+
 class UploadsHandler:
    def GET(self, file_name):
        _require_auth()
@@ -1232,7 +1359,7 @@ class ModelsHandler:

    # Capability -> editable flag, current-value resolver, and supported provider
    # ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
-    _ASR_PROVIDERS = ["openai", "linkai", "baidu", "ali", "xunfei", "azure", "google"]
+    _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
    _TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
    _EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]

@@ -1502,10 +1629,23 @@ class ModelsHandler:

    @classmethod
    def _asr_capability(cls, local_config: dict) -> dict:
-        provider_id = (local_config.get("voice_to_text") or "openai").strip().lower()
+        # "Pick or empty" — when voice_to_text is unset we don't show a
+        # current selection. `suggested_provider` previews which vendor
+        # the bridge auto-picker would land on (purely a UX hint, NOT
+        # persisted). Once the user saves a vendor, we lock onto it.
+        explicit = (local_config.get("voice_to_text") or "").strip().lower()
+        suggested = ""
+        if not explicit:
+            for pid in cls._ASR_PROVIDERS:
+                meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
+                key_field = meta.get("api_key_field")
+                if key_field and cls._is_real_key(local_config.get(key_field, "")):
+                    suggested = pid
+                    break
        return {
            "editable": True,
-            "current_provider": provider_id,
+            "current_provider": explicit,
+            "suggested_provider": suggested,
            "current_model": "",
            "providers": cls._ASR_PROVIDERS,
        }
@@ -1897,6 +2037,10 @@ class ModelsHandler:
        file_cfg[key] = value
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] {key} set: {value!r}")
+        # Bridge caches voice_to_text routing + bot instance; refresh it
+        # so the change takes effect on the next voice request.
+        if key in ("voice_to_text", "text_to_voice"):
+            self._refresh_voice_routing()
        return json.dumps({"status": "success", key: value})

    def _set_tts(self, provider_id: str, model: str) -> str:
@@ -1910,8 +2054,17 @@ class ModelsHandler:
            file_cfg["text_to_voice_model"] = model
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
+        self._refresh_voice_routing()
        return json.dumps({"status": "success", "provider": provider_id, "model": model})

+    @staticmethod
+    def _refresh_voice_routing() -> None:
+        """Ask the Bridge to re-read its voice provider settings.
+
+        Best-effort: any exception raised while importing or refreshing is
+        logged as a warning and swallowed, so a failed refresh never fails
+        the caller.
+        """
+        try:
+            # Imported at call time rather than at module top — presumably
+            # to avoid an import cycle; TODO confirm.
+            from bridge.bridge import Bridge
+            Bridge().refresh_voice()
+        except Exception as e:
+            logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")
+
    def _set_embedding(self, provider_id: str, model: str) -> str:
        # provider_id="" + model="" means "switch back to legacy auto mode".
        local_config = conf()
@@ -1926,9 +2079,9 @@ class ModelsHandler:
            file_cfg["embedding_model"] = ""
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
-        # The agent's MemoryManager picks the new provider on next process
-        # restart; the index dim may now mismatch so a rebuild is needed.
-        # The frontend surfaces this via a confirm + post-save dialog.
+        # The next /memory rebuild-index command hot-swaps the provider onto
+        # the running MemoryManager (see plugins/cow_cli). The dim may have
+        # changed, so the frontend prompts the user to rebuild.
        return json.dumps({"status": "success", "provider": provider_id, "model": model})

    @staticmethod
--- a/voice/dashscope/__init__.py
+++ b/voice/dashscope/__init__.py
--- a/voice/dashscope/dashscope_voice.py
+++ b/voice/dashscope/dashscope_voice.py
@@ -0,0 +1,135 @@
+# encoding:utf-8
+"""
+DashScope (Aliyun Bailian) voice service.
+
+ASR : qwen3-asr-flash via dashscope.MultiModalConversation
+TTS : not yet implemented (see CosyVoice / qwen3-tts)
+
+Why MultiModalConversation instead of the OpenAI-compatible endpoint:
+  - SDK is already a project dep (used by chat/vision)
+  - Native API accepts local file:// paths up to 100 QPS without an OSS
+    round-trip, which is what we need for the "send a short voice
+    message" flow. Public URLs / Base64 also work.
+"""
+import os
+from typing import Optional
+
+import dashscope
+from dashscope import MultiModalConversation
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice import audio_convert
+from voice.voice import Voice
+
+
+# Model used when `voice_to_text_model` is empty or unset in the config.
+DEFAULT_ASR_MODEL = "qwen3-asr-flash"
+# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
+# qwen3-asr-flash-filetrans which is async-only and out of scope here.
+# NOTE(review): nothing in this module reads this constant, so the
+# duration limit is documented here but not checked before submitting.
+MAX_DURATION_SECONDS = 300
+# Size threshold (10 MiB) above which voiceToText logs a warning; the
+# file is still submitted.
+MAX_FILE_BYTES = 10 * 1024 * 1024
+
+
+class DashScopeVoice(Voice):
+    """Voice backend for DashScope (Aliyun Bailian).
+
+    Speech recognition goes through ``MultiModalConversation``; speech
+    synthesis is not implemented yet and always yields an error reply.
+    """
+
+    def __init__(self):
+        # api_key is applied per-call (chat bot does the same) so a live
+        # config change via the web console takes effect without restart.
+        pass
+
+    def voiceToText(self, voice_file: str) -> Reply:
+        """Transcribe a local audio file.
+
+        Args:
+            voice_file: Path to the audio file. AMR/SILK input is converted
+                to mp3 first; other formats are submitted unchanged.
+
+        Returns:
+            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
+            ``Reply(ReplyType.ERROR, ...)`` with a user-facing message.
+            Any ``Exception`` raised along the way is logged and turned
+            into the generic error reply.
+        """
+        try:
+            # Fail fast on missing configuration, before spending time on
+            # an audio conversion whose result could not be submitted.
+            api_key = conf().get("dashscope_api_key", "")
+            if not api_key:
+                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
+
+            voice_file = self._ensure_compatible_format(voice_file)
+            self._warn_if_oversized(voice_file)
+
+            dashscope.api_key = api_key
+            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
+            file_uri = f"file://{os.path.abspath(voice_file)}"
+
+            response = MultiModalConversation.call(
+                model=model,
+                messages=[{"role": "user", "content": [{"audio": file_uri}]}],
+                result_format="message",
+                asr_options={"enable_itn": False, "enable_lid": True},
+            )
+
+            text = self._extract_text(response)
+            if text is None:
+                logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
+                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+            logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
+            return Reply(ReplyType.TEXT, text)
+        except Exception as e:
+            logger.exception(f"[DashScopeVoice] voiceToText exception: {e}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+    def textToVoice(self, text: str) -> Reply:
+        """Return an error reply; speech synthesis is not wired up yet."""
+        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
+        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")
+
+    @staticmethod
+    def _warn_if_oversized(voice_file: str) -> None:
+        """Log a warning when the file is larger than MAX_FILE_BYTES."""
+        try:
+            size = os.path.getsize(voice_file)
+        except OSError:
+            # Best-effort check: a stat failure is not treated as an error here.
+            return
+        if size > MAX_FILE_BYTES:
+            logger.warning(
+                f"[DashScopeVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
+                f"qwen3-asr-flash may reject it"
+            )
+
+    @staticmethod
+    def _ensure_compatible_format(voice_file: str) -> str:
+        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.
+
+        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through. If the
+        conversion raises, a warning is logged and the original path is
+        returned.
+
+        Returns:
+            Path of the file to submit: the ``.mp3`` sibling after a
+            successful conversion, else ``voice_file`` unchanged.
+        """
+        if not voice_file.lower().endswith((".amr", ".silk", ".slk")):
+            return voice_file
+        mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
+        try:
+            audio_convert.any_to_mp3(voice_file, mp3_file)
+        except Exception as e:
+            logger.warning(
+                f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
+                f"submitting original file"
+            )
+            return voice_file
+        return mp3_file
+
+    @staticmethod
+    def _extract_text(response) -> Optional[str]:
+        """Pull the recognized text out of MultiModalConversation response.
+
+        Successful shape (result_format="message"):
+          response.output.choices[0].message.content -> list of {"text": "..."}
+          or in some SDK versions a plain string.
+
+        Returns:
+            The stripped transcript, or None when the status code is not
+            200, the payload has no usable text, or its shape is unexpected.
+        """
+        try:
+            if getattr(response, "status_code", 200) != 200:
+                return None
+            choices = response.output.get("choices") or []
+            if not choices:
+                return None
+            content = choices[0].get("message", {}).get("content")
+            if isinstance(content, str):
+                return content.strip() or None
+            if not isinstance(content, list):
+                return None
+            parts = []
+            for item in content:
+                fragment = item.get("text") if isinstance(item, dict) else item
+                # Skip non-string fragments (e.g. a null "text") instead of
+                # letting one of them void the whole transcript.
+                if isinstance(fragment, str):
+                    parts.append(fragment)
+            return "".join(parts).strip() or None
+        except Exception:
+            # Deliberately broad: any unexpected response shape counts as
+            # "no text"; voiceToText logs the raw response in that case.
+            return None
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -58,4 +58,12 @@ def create_voice(voice_type):
        from voice.minimax.minimax_voice import MinimaxVoice

        return MinimaxVoice()
+    elif voice_type == "dashscope":
+        from voice.dashscope.dashscope_voice import DashScopeVoice
+
+        return DashScopeVoice()
+    elif voice_type == "zhipu" or voice_type == "zhipuai":
+        from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
+
+        return ZhipuAIVoice()
    raise RuntimeError
--- a/voice/zhipuai/__init__.py
+++ b/voice/zhipuai/__init__.py
--- a/voice/zhipuai/zhipuai_voice.py
+++ b/voice/zhipuai/zhipuai_voice.py
@@ -0,0 +1,102 @@
+# encoding:utf-8
+"""
+ZhipuAI (BigModel) voice service.
+
+ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
+TTS : not yet implemented.
+
+Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
+File size <= 25MB, duration <= 30s per request.
+"""
+import os
+
+import requests
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice import audio_convert
+from voice.voice import Voice
+
+
+# Model used when `voice_to_text_model` is empty or unset in the config.
+DEFAULT_ASR_MODEL = "glm-asr-2512"
+# Base URL used when `zhipu_ai_api_base` is empty or unset; the request
+# path `/audio/transcriptions` is appended to it.
+DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
+# Size threshold (25 MiB) above which voiceToText logs a warning; the
+# file is still submitted.
+MAX_FILE_BYTES = 25 * 1024 * 1024
+# Passed as `timeout=` to requests.post: (connect, read) in seconds.
+REQUEST_TIMEOUT = (5, 60)
+
+
+class ZhipuAIVoice(Voice):
+    """Voice backend for ZhipuAI (BigModel).
+
+    Speech recognition posts the audio file to the
+    ``/audio/transcriptions`` endpoint; speech synthesis is not implemented
+    yet and always yields an error reply.
+    """
+
+    def __init__(self):
+        # api_key / base read per-call so live config edits take effect.
+        pass
+
+    def voiceToText(self, voice_file: str) -> Reply:
+        """Transcribe a local audio file.
+
+        Args:
+            voice_file: Path to the audio file. Anything that is not
+                ``.mp3`` / ``.wav`` is converted to mp3 first.
+
+        Returns:
+            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
+            ``Reply(ReplyType.ERROR, ...)`` with a user-facing message.
+            Any ``Exception`` raised along the way is logged and turned
+            into the generic error reply.
+        """
+        try:
+            # Fail fast on missing configuration, before spending time on
+            # an audio conversion whose result could not be submitted.
+            api_key = conf().get("zhipu_ai_api_key", "")
+            if not api_key:
+                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
+
+            voice_file = self._ensure_compatible_format(voice_file)
+            self._warn_if_oversized(voice_file)
+
+            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
+            url = f"{api_base}/audio/transcriptions"
+            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
+
+            # Keep the file handle open only for the duration of the upload.
+            with open(voice_file, "rb") as f:
+                response = requests.post(
+                    url,
+                    headers={"Authorization": f"Bearer {api_key}"},
+                    files={"file": (os.path.basename(voice_file), f)},
+                    data={"model": model, "stream": "false"},
+                    timeout=REQUEST_TIMEOUT,
+                )
+
+            if response.status_code != 200:
+                logger.error(
+                    f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
+                    f"body={response.text[:500]}"
+                )
+                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+            payload = response.json()
+            text = (payload.get("text") or "").strip()
+            if not text:
+                logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
+                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+            logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
+            return Reply(ReplyType.TEXT, text)
+        except Exception as e:
+            logger.exception(f"[ZhipuAIVoice] voiceToText exception: {e}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+    def textToVoice(self, text: str) -> Reply:
+        """Return an error reply; speech synthesis is not wired up yet."""
+        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
+
+    @staticmethod
+    def _warn_if_oversized(voice_file: str) -> None:
+        """Log a warning when the file is larger than MAX_FILE_BYTES."""
+        try:
+            size = os.path.getsize(voice_file)
+        except OSError:
+            # Best-effort check: a stat failure is not treated as an error here.
+            return
+        if size > MAX_FILE_BYTES:
+            logger.warning(
+                f"[ZhipuAIVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
+                f"glm-asr-2512 may reject it"
+            )
+
+    @staticmethod
+    def _ensure_compatible_format(voice_file: str) -> str:
+        """Return a path to submit, converting non-mp3/wav input to mp3.
+
+        If the conversion raises, a warning is logged and the original
+        path is returned so the request is still attempted.
+        """
+        # glm-asr-2512 only accepts .wav / .mp3 — convert everything else
+        # (webm from the browser mic, m4a/amr/silk from chat channels, etc).
+        if voice_file.lower().endswith((".mp3", ".wav")):
+            return voice_file
+        mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
+        try:
+            audio_convert.any_to_mp3(voice_file, mp3_file)
+        except Exception as e:
+            logger.warning(
+                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
+                f"submitting original file"
+            )
+            return voice_file
+        return mp3_file