feat(voice): add dashscope & zhipu ASR, in-page mic input

2026-07-17 11:07:11 +08:00 · 2026-05-20 22:36:37 +08:00
parent fff7326209
commit 2b90f377e6
9 changed files with 786 additions and 34 deletions
--- a/voice/dashscope/__init__.py
+++ b/voice/dashscope/__init__.py
--- a/voice/dashscope/dashscope_voice.py
+++ b/voice/dashscope/dashscope_voice.py
@@ -0,0 +1,135 @@
+# encoding:utf-8
+"""
+DashScope (Aliyun Bailian) voice service.
+
+ASR : qwen3-asr-flash via dashscope.MultiModalConversation
+TTS : not yet implemented (see CosyVoice / qwen3-tts)
+
+Why MultiModalConversation instead of the OpenAI-compatible endpoint:
+  - SDK is already a project dep (used by chat/vision)
+  - Native API accepts local file:// paths up to 100 QPS without an OSS
+    round-trip, which is what we need for the "send a short voice
+    message" flow. Public URLs / Base64 also work.
+"""
+import os
+from typing import Optional
+
+import dashscope
+from dashscope import MultiModalConversation
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice import audio_convert
+from voice.voice import Voice
+
+
+DEFAULT_ASR_MODEL = "qwen3-asr-flash"  # used when voice_to_text_model is unset
+# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
+# qwen3-asr-flash-filetrans which is async-only and out of scope here.
+MAX_DURATION_SECONDS = 300  # NOTE(review): not referenced in the visible code
+MAX_FILE_BYTES = 10 * 1024 * 1024  # size above which voiceToText logs a warning
+
+
+class DashScopeVoice(Voice):
+    """DashScope voice backend: ASR via MultiModalConversation; TTS not implemented."""
+
+    def __init__(self):
+        # api_key is applied per-call (chat bot does the same) so a live
+        # config change via the web console takes effect without restart.
+        pass
+
+    def voiceToText(self, voice_file: str):
+        """Transcribe ``voice_file`` and wrap the outcome in a Reply.
+
+        Returns a TEXT Reply with the transcript, or an ERROR Reply when the
+        API key is missing, the call fails/raises, or no text comes back.
+        """
+        try:
+            # Validate the key first so a misconfigured deployment does not
+            # pay for an audio conversion (and leave a stray .mp3 behind).
+            api_key = conf().get("dashscope_api_key", "")
+            if not api_key:
+                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
+            dashscope.api_key = api_key
+
+            voice_file = self._ensure_compatible_format(voice_file)
+            try:
+                size = os.path.getsize(voice_file)
+            except OSError:
+                size = 0  # best effort: let the API call report a bad path
+            if size > MAX_FILE_BYTES:
+                logger.warning(
+                    f"[DashScopeVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
+                    f"qwen3-asr-flash may reject it"
+                )
+
+            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
+            file_uri = f"file://{os.path.abspath(voice_file)}"
+            response = MultiModalConversation.call(
+                model=model,
+                messages=[{"role": "user", "content": [{"audio": file_uri}]}],
+                result_format="message",
+                asr_options={"enable_itn": False, "enable_lid": True},
+            )
+
+            text = self._extract_text(response)
+            if text is None:
+                logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
+                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+            logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
+            return Reply(ReplyType.TEXT, text)
+        except Exception as e:
+            logger.exception(f"[DashScopeVoice] voiceToText exception: {e}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+    def textToVoice(self, text: str):
+        """Return an ERROR Reply; speech synthesis is not implemented here."""
+        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
+        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")
+
+    @staticmethod
+    def _ensure_compatible_format(voice_file: str) -> str:
+        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.
+
+        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through. If the
+        conversion raises, a warning is logged and the original path returned.
+        """
+        if not voice_file.lower().endswith((".amr", ".silk", ".slk")):
+            return voice_file
+        try:
+            mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
+            audio_convert.any_to_mp3(voice_file, mp3_file)
+            return mp3_file
+        except Exception as e:
+            logger.warning(
+                f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
+                f"submitting original file"
+            )
+            return voice_file
+
+    @staticmethod
+    def _extract_text(response) -> Optional[str]:
+        """Pull the recognized text out of MultiModalConversation response.
+
+        Successful shape (result_format="message"):
+          response.output.choices[0].message.content -> list of {"text": "..."}
+          or in some SDK versions a plain string.
+        Returns None for a non-200 status, an unexpected shape, or empty text.
+        """
+        try:
+            if getattr(response, "status_code", 200) != 200:
+                return None
+            choices = response.output.get("choices") or []
+            content = choices[0].get("message", {}).get("content") if choices else None
+        except Exception:  # any unexpected response shape counts as "no text"
+            return None
+        if isinstance(content, str):
+            return content.strip() or None
+        if not isinstance(content, list):
+            return None
+        # A non-string "text" value must not discard the rest of the transcript.
+        parts = [item.get("text") if isinstance(item, dict) else item for item in content]
+        return "".join(p for p in parts if isinstance(p, str)).strip() or None
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -58,4 +58,12 @@ def create_voice(voice_type):
        from voice.minimax.minimax_voice import MinimaxVoice

        return MinimaxVoice()
+    elif voice_type == "dashscope":
+        from voice.dashscope.dashscope_voice import DashScopeVoice
+
+        return DashScopeVoice()
+    elif voice_type == "zhipu" or voice_type == "zhipuai":
+        from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
+
+        return ZhipuAIVoice()
    raise RuntimeError
--- a/voice/zhipuai/__init__.py
+++ b/voice/zhipuai/__init__.py
--- a/voice/zhipuai/zhipuai_voice.py
+++ b/voice/zhipuai/zhipuai_voice.py
@@ -0,0 +1,102 @@
+# encoding:utf-8
+"""
+ZhipuAI (BigModel) voice service.
+
+ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
+TTS : not yet implemented.
+
+Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
+File size <= 25MB, duration <= 30s per request.
+"""
+import os
+
+import requests
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice import audio_convert
+from voice.voice import Voice
+
+
+DEFAULT_ASR_MODEL = "glm-asr-2512"  # used when voice_to_text_model is unset
+DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"  # fallback for zhipu_ai_api_base
+MAX_FILE_BYTES = 25 * 1024 * 1024  # size above which voiceToText logs a warning
+REQUEST_TIMEOUT = (5, 60)  # passed to requests.post as timeout: (connect, read) seconds
+
+
+class ZhipuAIVoice(Voice):
+    """ZhipuAI voice backend: ASR over /audio/transcriptions; TTS not implemented."""
+
+    def __init__(self):
+        # api_key / base read per-call so live config edits take effect.
+        pass
+
+    def voiceToText(self, voice_file: str):
+        """Transcribe voice_file into a TEXT Reply; any failure yields an ERROR Reply."""
+        try:
+            # Check the key first: no point converting audio we cannot submit.
+            api_key = conf().get("zhipu_ai_api_key", "")
+            if not api_key:
+                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
+
+            voice_file = self._ensure_compatible_format(voice_file)
+            try:
+                size = os.path.getsize(voice_file)
+            except OSError:
+                size = 0  # best effort: open() below reports a bad path
+            if size > MAX_FILE_BYTES:
+                logger.warning(
+                    f"[ZhipuAIVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
+                    f"glm-asr-2512 may reject it"
+                )
+
+            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
+            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
+            with open(voice_file, "rb") as f:
+                response = requests.post(
+                    f"{api_base}/audio/transcriptions",
+                    headers={"Authorization": f"Bearer {api_key}"},
+                    files={"file": (os.path.basename(voice_file), f)},
+                    data={"model": model, "stream": "false"},
+                    timeout=REQUEST_TIMEOUT,
+                )
+
+            if response.status_code != 200:
+                logger.error(
+                    f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
+                    f"body={response.text[:500]}"
+                )
+                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+            payload = response.json()
+            text = (payload.get("text") or "").strip()
+            if not text:
+                logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
+                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+            logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
+            return Reply(ReplyType.TEXT, text)
+        except Exception as e:
+            logger.exception(f"[ZhipuAIVoice] voiceToText exception: {e}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")
+
+    def textToVoice(self, text: str):
+        """Return an ERROR Reply; speech synthesis is not implemented here."""
+        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
+
+    @staticmethod
+    def _ensure_compatible_format(voice_file: str) -> str:
+        """Convert to mp3 unless already .mp3/.wav; on failure keep the original."""
+        # glm-asr-2512 only accepts .wav / .mp3 (not browser webm, m4a/amr/silk).
+        if voice_file.lower().endswith((".mp3", ".wav")):
+            return voice_file
+        try:
+            mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
+            audio_convert.any_to_mp3(voice_file, mp3_file)
+            return mp3_file
+        except Exception as e:
+            logger.warning(
+                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
+                f"submitting original file"
+            )
+            return voice_file