feat(voice): add dashscope & zhipu ASR, in-page mic input

2026-07-19 12:47:25 +08:00 · 2026-05-20 22:36:37 +08:00
parent fff7326209
commit 2b90f377e6
9 changed files with 786 additions and 34 deletions
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -1,10 +1,11 @@
+import datetime
 import hashlib
 import hmac
-import time
 import json
 import logging
 import mimetypes
 import os
+import random
 import threading
 import time
 import uuid
@@ -340,6 +341,10 @@ class WebChannel(ChatChannel):
        # Use a single-element list as a mutable counter accessible from closure.
        reasoning_chars_sent = [0]
        reasoning_capped_notified = [False]
+        # Captures the first error message emitted by agent_stream so the
+        # subsequent agent_end handler can skip its "empty final_response"
+        # fallback (which would otherwise overwrite the real error).
+        streamed_error: List[str] = []

        def on_event(event: dict):
            if request_id not in self.sse_queues:
@@ -398,6 +403,25 @@ class WebChannel(ChatChannel):
                if tool_calls:
                    q.put({"type": "message_end", "has_tool_calls": True})

+            elif event_type == "error":
+                # Agent raised an exception (LLM 401/timeout/etc). Surface the
+                # real message instead of letting the empty-response fallback
+                # below hide it as "(模型未返回任何内容)".
+                err_msg = data.get("error") or "unknown error"
+                logger.warning(
+                    f"[WebChannel] agent_stream emitted error for "
+                    f"request {request_id}: {err_msg}"
+                )
+                # Remember it so the agent_end handler below knows not to
+                # rewrite the message into a generic empty-response notice.
+                streamed_error.append(err_msg)
+                q.put({
+                    "type": "done",
+                    "content": f"❌ {err_msg}",
+                    "request_id": request_id,
+                    "timestamp": time.time(),
+                })
+
            elif event_type == "agent_end":
                # Safety net: if the agent finishes with an empty final_response,
                # chat_channel skips _send_reply (because reply.content is empty),
@@ -406,16 +430,21 @@ class WebChannel(ChatChannel):
                # here so the frontend always gets closure.
                final_response = data.get("final_response", "")
                if not final_response or not str(final_response).strip():
-                    logger.warning(
-                        f"[WebChannel] agent_end with empty final_response for "
-                        f"request {request_id}, sending fallback done"
-                    )
-                    q.put({
-                        "type": "done",
-                        "content": "(模型未返回任何内容，请重试或换一种方式描述你的需求)",
-                        "request_id": request_id,
-                        "timestamp": time.time(),
-                    })
+                    if streamed_error:
+                        # Error was already surfaced via the `error` event
+                        # handler above; nothing more to do here.
+                        pass
+                    else:
+                        logger.warning(
+                            f"[WebChannel] agent_end with empty final_response for "
+                            f"request {request_id}, sending fallback done"
+                        )
+                        q.put({
+                            "type": "done",
+                            "content": "(模型未返回任何内容，请重试或换一种方式描述你的需求)",
+                            "request_id": request_id,
+                            "timestamp": time.time(),
+                        })

            elif event_type == "file_to_send":
                file_path = data.get("path", "")
@@ -432,6 +461,39 @@ class WebChannel(ChatChannel):

        return on_event

+    @staticmethod
+    def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
+        """Delete voice-input audio files older than `max_age_seconds`.
+
+        Called once at startup. Web mic recordings live in the upload
+        directory so the browser can replay them inside the conversation
+        bubble. We don't persist them to history, so once a process
+        restarts they're useless — but they're never auto-cleaned
+        anywhere else, so without this they accumulate over time.
+        """
+        try:
+            upload_dir = _get_upload_dir()
+            if not os.path.isdir(upload_dir):
+                return
+            now = time.time()
+            removed = 0
+            for name in os.listdir(upload_dir):
+                if not name.startswith("voice_input_"):
+                    continue
+                full = os.path.join(upload_dir, name)
+                try:
+                    if not os.path.isfile(full):
+                        continue
+                    if now - os.path.getmtime(full) > max_age_seconds:
+                        os.remove(full)
+                        removed += 1
+                except OSError:
+                    continue
+            if removed:
+                logger.info(f"[WebChannel] cleaned up {removed} stale voice recording(s) from {upload_dir}")
+        except Exception as e:
+            logger.warning(f"[WebChannel] voice cleanup failed: {e}")
+
    def upload_file(self):
        """Handle file or directory upload via multipart/form-data."""
        try:
@@ -703,6 +765,8 @@ class WebChannel(ChatChannel):
        port = conf().get("web_port", 9899)
        is_public_bind = host in ("0.0.0.0", "::")

+        self._cleanup_stale_voice_recordings()
+
        # 打印可用渠道类型提示
        logger.info(
            "[WebChannel] 全部可用通道如下，可修改 config.json 配置文件中的 channel_type 字段进行切换，多个通道用逗号分隔：")
@@ -746,6 +810,7 @@ class WebChannel(ChatChannel):
            '/upload', 'UploadHandler',
            '/uploads/(.*)', 'UploadsHandler',
            '/api/file', 'FileServeHandler',
+            '/api/voice/asr', 'VoiceAsrHandler',
            '/poll', 'PollHandler',
            '/stream', 'StreamHandler',
            '/chat', 'ChatHandler',
@@ -870,6 +935,68 @@ class UploadHandler:
        return WebChannel().upload_file()


+class VoiceAsrHandler:
+    """
+    Accept a short audio recording from the web console mic button,
+    save it under uploads/ so the browser can replay it, then run it
+    through the currently configured ASR provider.
+
+    Returns {status, text, audio_url} on success — the frontend renders
+    a voice-message bubble with the playable audio and the transcribed
+    caption.
+    """
+    def POST(self):
+        _require_auth()
+        web.header('Content-Type', 'application/json; charset=utf-8')
+
+        saved_path = None
+        try:
+            params = _raw_web_input()
+            file_obj = params.get("file")
+            if file_obj is None:
+                return json.dumps({"status": "error", "message": "no audio file"})
+
+            filename = getattr(file_obj, "filename", "") or "recording.webm"
+            ext = os.path.splitext(filename)[1].lower() or ".webm"
+            if ext not in (".webm", ".ogg", ".opus", ".mp4", ".m4a", ".mp3", ".wav"):
+                ext = ".webm"
+
+            upload_dir = _get_upload_dir()
+            os.makedirs(upload_dir, exist_ok=True)
+            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+            saved_name = f"voice_input_{ts}_{random.randint(0, 9999)}{ext}"
+            saved_path = os.path.join(upload_dir, saved_name)
+            with open(saved_path, "wb") as f:
+                f.write(file_obj.file.read() if hasattr(file_obj, "file") else file_obj.value)
+
+            audio_url = f"/uploads/{saved_name}"
+
+            from bridge.bridge import Bridge
+            reply = Bridge().fetch_voice_to_text(saved_path)
+            if reply is None:
+                return json.dumps({
+                    "status": "error",
+                    "message": "ASR returned no reply",
+                    "audio_url": audio_url,
+                })
+
+            from bridge.reply import ReplyType
+            if reply.type == ReplyType.TEXT:
+                return json.dumps({
+                    "status": "success",
+                    "text": reply.content or "",
+                    "audio_url": audio_url,
+                })
+            return json.dumps({
+                "status": "error",
+                "message": reply.content or "ASR failed",
+                "audio_url": audio_url,
+            })
+        except Exception as e:
+            logger.exception(f"[VoiceAsrHandler] failed: {e}")
+            return json.dumps({"status": "error", "message": str(e)})
+
+
 class UploadsHandler:
    def GET(self, file_name):
        _require_auth()
@@ -1232,7 +1359,7 @@ class ModelsHandler:

    # Capability -> editable flag, current-value resolver, and supported provider
    # ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
-    _ASR_PROVIDERS = ["openai", "linkai", "baidu", "ali", "xunfei", "azure", "google"]
+    _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
    _TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
    _EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]

@@ -1502,10 +1629,23 @@ class ModelsHandler:

    @classmethod
    def _asr_capability(cls, local_config: dict) -> dict:
-        provider_id = (local_config.get("voice_to_text") or "openai").strip().lower()
+        # "Pick or empty" — when voice_to_text is unset we don't show a
+        # current selection. `suggested_provider` previews which vendor
+        # the bridge auto-picker would land on (purely a UX hint, NOT
+        # persisted). Once the user saves a vendor, we lock onto it.
+        explicit = (local_config.get("voice_to_text") or "").strip().lower()
+        suggested = ""
+        if not explicit:
+            for pid in cls._ASR_PROVIDERS:
+                meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
+                key_field = meta.get("api_key_field")
+                if key_field and cls._is_real_key(local_config.get(key_field, "")):
+                    suggested = pid
+                    break
        return {
            "editable": True,
-            "current_provider": provider_id,
+            "current_provider": explicit,
+            "suggested_provider": suggested,
            "current_model": "",
            "providers": cls._ASR_PROVIDERS,
        }
@@ -1897,6 +2037,10 @@ class ModelsHandler:
        file_cfg[key] = value
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] {key} set: {value!r}")
+        # Bridge caches voice_to_text routing + bot instance; refresh it
+        # so the change takes effect on the next voice request.
+        if key in ("voice_to_text", "text_to_voice"):
+            self._refresh_voice_routing()
        return json.dumps({"status": "success", key: value})

    def _set_tts(self, provider_id: str, model: str) -> str:
@@ -1910,8 +2054,17 @@ class ModelsHandler:
            file_cfg["text_to_voice_model"] = model
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
+        self._refresh_voice_routing()
        return json.dumps({"status": "success", "provider": provider_id, "model": model})

+    @staticmethod
+    def _refresh_voice_routing() -> None:
+        try:
+            from bridge.bridge import Bridge
+            Bridge().refresh_voice()
+        except Exception as e:
+            logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")
+
    def _set_embedding(self, provider_id: str, model: str) -> str:
        # provider_id="" + model="" means "switch back to legacy auto mode".
        local_config = conf()
@@ -1926,9 +2079,9 @@ class ModelsHandler:
            file_cfg["embedding_model"] = ""
        self._write_file_config(file_cfg)
        logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
-        # The agent's MemoryManager picks the new provider on next process
-        # restart; the index dim may now mismatch so a rebuild is needed.
-        # The frontend surfaces this via a confirm + post-save dialog.
+        # The next /memory rebuild-index command hot-swaps the provider onto
+        # the running MemoryManager (see plugins/cow_cli). The dim may have
+        # changed, so the frontend prompts the user to rebuild.
        return json.dumps({"status": "success", "provider": provider_id, "model": model})

    @staticmethod