feat(voice): rework TTS/ASR stack and unify tool/skill config schema

2026-07-17 11:07:11 +08:00 · 2026-05-21 16:00:54 +08:00
parent 2b90f377e6
commit b8333e351c
31 changed files with 1551 additions and 335 deletions
--- a/voice/dashscope/dashscope_voice.py
+++ b/voice/dashscope/dashscope_voice.py
@@ -1,20 +1,13 @@
 # encoding:utf-8
-"""
-DashScope (Aliyun Bailian) voice service.
-
-ASR : qwen3-asr-flash via dashscope.MultiModalConversation
-TTS : not yet implemented (see CosyVoice / qwen3-tts)
-
-Why MultiModalConversation instead of the OpenAI-compatible endpoint:
-  - SDK is already a project dep (used by chat/vision)
-  - Native API accepts local file:// paths up to 100 QPS without an OSS
-    round-trip, which is what we need for the "send a short voice
-    message" flow. Public URLs / Base64 also work.
-"""
+"""DashScope voice: qwen3-asr-flash (ASR) + qwen3-tts-flash (TTS)
+via dashscope.MultiModalConversation."""
+import datetime
 import os
+import random
 from typing import Optional

 import dashscope
+import requests
 from dashscope import MultiModalConversation

 from bridge.reply import Reply, ReplyType
@@ -25,16 +18,14 @@ from voice.voice import Voice


 DEFAULT_ASR_MODEL = "qwen3-asr-flash"
-# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
-# qwen3-asr-flash-filetrans which is async-only and out of scope here.
+DEFAULT_TTS_MODEL = "qwen3-tts-flash"
+DEFAULT_TTS_VOICE = "Cherry"
 MAX_DURATION_SECONDS = 300
 MAX_FILE_BYTES = 10 * 1024 * 1024


 class DashScopeVoice(Voice):
    def __init__(self):
-        # api_key is applied per-call (chat bot does the same) so a live
-        # config change via the web console takes effect without restart.
        pass

    def voiceToText(self, voice_file: str):
@@ -83,14 +74,72 @@ class DashScopeVoice(Voice):
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")

    def textToVoice(self, text: str):
-        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
-        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")
+        try:
+            api_key = conf().get("dashscope_api_key", "")
+            if not api_key:
+                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
+            dashscope.api_key = api_key
+
+            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
+            voice = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
+            response = MultiModalConversation.call(
+                model=model,
+                api_key=api_key,
+                text=text,
+                voice=voice,
+                stream=False,
+            )
+
+            url = self._extract_audio_url(response)
+            if not url:
+                logger.error(f"[DashScopeVoice] textToVoice failed: {response}")
+                return Reply(ReplyType.ERROR, "语音合成失败")
+
+            local_path = self._download_audio(url)
+            if not local_path:
+                return Reply(ReplyType.ERROR, "语音合成失败")
+
+            logger.info(f"[DashScopeVoice] textToVoice model={model} voice={voice} file={local_path}")
+            return Reply(ReplyType.VOICE, local_path)
+        except Exception as e:
+            logger.exception(f"[DashScopeVoice] textToVoice exception: {e}")
+            return Reply(ReplyType.ERROR, "语音合成失败")
+
+    @staticmethod
+    def _extract_audio_url(response) -> Optional[str]:
+        try:
+            if getattr(response, "status_code", 200) != 200:
+                return None
+            audio = response.output.get("audio") if response.output else None
+            if isinstance(audio, dict):
+                return audio.get("url") or None
+            return getattr(audio, "url", None)
+        except Exception:
+            return None
+
+    @staticmethod
+    def _download_audio(url: str) -> Optional[str]:
+        try:
+            tmp_dir = os.path.join(os.getcwd(), "tmp")
+            os.makedirs(tmp_dir, exist_ok=True)
+            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+            ext = os.path.splitext(url.split("?", 1)[0])[1].lower() or ".wav"
+            if ext not in (".mp3", ".wav", ".m4a", ".aac", ".opus"):
+                ext = ".wav"
+            dst = os.path.join(tmp_dir, f"dashscope_tts_{ts}_{random.randint(0, 9999)}{ext}")
+            resp = requests.get(url, timeout=60)
+            resp.raise_for_status()
+            with open(dst, "wb") as f:
+                f.write(resp.content)
+            return dst
+        except Exception as e:
+            logger.error(f"[DashScopeVoice] download audio failed: {e}")
+            return None

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
-        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.
-        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.
-        """
+        # qwen3-asr-flash doesn't accept AMR/SILK: convert those to mp3 (the original file is used if conversion fails); every other format passes through unchanged.
        lower = voice_file.lower()
        if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"):
            try:
@@ -98,20 +147,11 @@ class DashScopeVoice(Voice):
                audio_convert.any_to_mp3(voice_file, mp3_file)
                return mp3_file
            except Exception as e:
-                logger.warning(
-                    f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
-                    f"submitting original file"
-                )
+                logger.warning(f"[DashScopeVoice] mp3 convert failed: {e}")
        return voice_file

    @staticmethod
    def _extract_text(response) -> Optional[str]:
-        """Pull the recognized text out of MultiModalConversation response.
-
-        Successful shape (result_format="message"):
-          response.output.choices[0].message.content -> list of {"text": "..."}
-          or in some SDK versions a plain string.
-        """
        try:
            if getattr(response, "status_code", 200) != 200:
                return None
--- a/voice/linkai/linkai_voice.py
+++ b/voice/linkai/linkai_voice.py
@@ -1,16 +1,18 @@
-"""
-google voice service
-"""
+"""LinkAI voice: Whisper ASR + multi-vendor TTS (OpenAI / Doubao / Baidu),
+proxied through the LinkAI gateway. API docs: https://docs.link-ai.tech/platform/api/voice-speech"""
+import datetime
+import os
 import random
+
 import requests
-from voice import audio_convert
+
 from bridge.reply import Reply, ReplyType
+from common import const
 from common.log import logger
 from config import conf
+from voice import audio_convert
 from voice.voice import Voice
-from common import const
-import os
-import datetime
+

 class LinkAIVoice(Voice):
    def __init__(self):
@@ -21,63 +23,67 @@ class LinkAIVoice(Voice):
        try:
            url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions"
            headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
-            model = None
-            if not conf().get("text_to_voice") or conf().get("voice_to_text") == "openai":
-                model = const.WHISPER_1
+            # Pin whisper-1: gateway ignores any other ASR model id.
+            model = const.WHISPER_1
            if voice_file.endswith(".amr"):
                try:
                    mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
                    audio_convert.any_to_mp3(voice_file, mp3_file)
                    voice_file = mp3_file
                except Exception as e:
-                    logger.warn(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {format(e)}")
-            file = open(voice_file, "rb")
-            file_body = {
-                "file": file
-            }
-            data = {
-                "model": model
-            }
-            res = requests.post(url, files=file_body, headers=headers, data=data, timeout=(5, 60))
-            if res.status_code == 200:
-                text = res.json().get("text")
-            else:
-                res_json = res.json()
-                logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={res_json.get('message')}")
+                    logger.warning(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {e}")
+            with open(voice_file, "rb") as file:
+                res = requests.post(
+                    url,
+                    files={"file": file},
+                    headers=headers,
+                    data={"model": model},
+                    timeout=(5, 60),
+                )
+            if res.status_code != 200:
+                msg = ""
+                try:
+                    msg = res.json().get("message", "")
+                except Exception:
+                    pass
+                logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={msg}")
                return None
-            reply = Reply(ReplyType.TEXT, text)
+            text = res.json().get("text")
            logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}")
+            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            logger.error(e)
            return None
-        return reply

    def textToVoice(self, text):
        try:
            url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech"
            headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
-            model = const.TTS_1
-            if not conf().get("text_to_voice") or conf().get("text_to_voice") in ["openai", const.TTS_1, const.TTS_1_HD]:
-                model = conf().get("text_to_voice_model") or const.TTS_1
+            # Gateway routes by `model` (tts-1 / doubao / baidu; sent only when configured) plus a
+            # `voice` from that engine's catalog. `app_code` is an optional workspace override.
            data = {
-                "model": model,
                "input": text,
                "voice": conf().get("tts_voice_id"),
-                "app_code": conf().get("linkai_app_code")
+                "app_code": conf().get("linkai_app_code"),
            }
+            model = conf().get("text_to_voice_model")
+            if model:
+                data["model"] = model
            res = requests.post(url, headers=headers, json=data, timeout=(5, 120))
-            if res.status_code == 200:
-                tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
-                with open(tmp_file_name, 'wb') as f:
-                    f.write(res.content)
-                reply = Reply(ReplyType.VOICE, tmp_file_name)
-                logger.info(f"[LinkVoice] textToVoice success, input={text}, model={model}, voice_id={data.get('voice')}")
-                return reply
-            else:
-                res_json = res.json()
-                logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={res_json.get('message')}")
+            if res.status_code != 200:
+                msg = ""
+                try:
+                    msg = res.json().get("message", "")
+                except Exception:
+                    pass
+                logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={msg}")
                return None
+            tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
+            os.makedirs(os.path.dirname(tmp_file_name), exist_ok=True)
+            with open(tmp_file_name, 'wb') as f:
+                f.write(res.content)
+            logger.info(f"[LinkVoice] textToVoice success, input={text}, voice_id={data.get('voice')}")
+            return Reply(ReplyType.VOICE, tmp_file_name)
        except Exception as e:
            logger.error(e)
-            # reply = Reply(ReplyType.ERROR, "遇到了一点小问题，请稍后再问我吧")
            return None
--- a/voice/minimax/minimax_voice.py
+++ b/voice/minimax/minimax_voice.py
@@ -1,8 +1,7 @@
 # encoding:utf-8
-"""
-MiniMax TTS voice service
-"""
+"""MiniMax TTS via /v1/t2a_v2 (SSE stream, hex-encoded mp3 chunks)."""
 import datetime
+import json
 import random
 import requests

@@ -12,24 +11,12 @@ from config import conf
 from voice.voice import Voice


-MINIMAX_TTS_VOICES = [
-    "English_Graceful_Lady",
-    "English_Insightful_Speaker",
-    "English_radiant_girl",
-    "English_Persuasive_Man",
-    "English_Lucky_Robot",
-    "English_expressive_narrator",
-    "Chinese_Warm_Woman",
-    "Chinese_Gentle_Man",
-]
-
-
 class MinimaxVoice(Voice):
    def __init__(self):
        self.api_key = conf().get("minimax_api_key")
-        self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
-        # Strip trailing /v1 if present so we can always append /v1/t2a_v2
-        self.api_base = self.api_base.rstrip("/")
+        # Defaults to the mainland endpoint, which matches `sk-api-0-...` keys; set `minimax_api_base`
+        # to https://api.minimax.io for international workspaces. A trailing /v1 is stripped below.
+        self.api_base = (conf().get("minimax_api_base") or "https://api.minimaxi.com").rstrip("/")
        if self.api_base.endswith("/v1"):
            self.api_base = self.api_base[:-3]

@@ -68,12 +55,14 @@ class MinimaxVoice(Voice):
            response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
            response.raise_for_status()

-            # Parse SSE stream and collect hex-encoded audio chunks
+            # MiniMax returns HTTP 200 even on errors; capture base_resp for diagnostics.
            audio_chunks = []
-            buffer = ""
+            last_base_resp = None
+            event_count = 0
            for raw in response.iter_lines():
                if not raw:
                    continue
+                event_count += 1
                line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
                if not line.startswith("data:"):
                    continue
@@ -81,16 +70,31 @@ class MinimaxVoice(Voice):
                if not json_str or json_str == "[DONE]":
                    continue
                try:
-                    import json
                    event_data = json.loads(json_str)
-                    audio_hex = event_data.get("data", {}).get("audio")
-                    if audio_hex:
-                        audio_chunks.append(bytes.fromhex(audio_hex))
                except Exception:
                    continue
+                base_resp = event_data.get("base_resp") or {}
+                if base_resp:
+                    last_base_resp = base_resp
+                audio_hex = (event_data.get("data") or {}).get("audio")
+                if audio_hex:
+                    try:
+                        audio_chunks.append(bytes.fromhex(audio_hex))
+                    except Exception as e:
+                        logger.warning(f"[MINIMAX] skip bad audio hex chunk: {e}")

            if not audio_chunks:
-                logger.error("[MINIMAX] TTS returned no audio data")
+                ct = response.headers.get("Content-Type", "")
+                if last_base_resp and last_base_resp.get("status_code") not in (None, 0):
+                    logger.error(
+                        f"[MINIMAX] TTS failed: status_code={last_base_resp.get('status_code')}, "
+                        f"status_msg={last_base_resp.get('status_msg')}, model={model}, voice_id={voice_id}"
+                    )
+                else:
+                    logger.error(
+                        f"[MINIMAX] TTS returned no audio data, model={model}, voice_id={voice_id}, "
+                        f"url={url}, http={response.status_code}, content_type={ct!r}, events={event_count}"
+                    )
                return Reply(ReplyType.ERROR, "语音合成失败，未获取到音频数据")

            audio_data = b"".join(audio_chunks)
--- a/voice/openai/openai_voice.py
+++ b/voice/openai/openai_voice.py
@@ -31,7 +31,8 @@ class OpenaiVoice(Voice):
                "file": file,
            }
            data = {
-                "model": "whisper-1",
+                # Defaults to gpt-4o-mini-transcribe; set `voice_to_text_model` to override (e.g. to fall back to whisper-1).
+                "model": conf().get("voice_to_text_model") or "gpt-4o-mini-transcribe",
            }
            response = requests.post(url, headers=headers, files=files, data=data)
            response_data = response.json()
--- a/voice/zhipuai/zhipuai_voice.py
+++ b/voice/zhipuai/zhipuai_voice.py
@@ -1,14 +1,8 @@
 # encoding:utf-8
-"""
-ZhipuAI (BigModel) voice service.
-
-ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
-TTS : not yet implemented.
-
-Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
-File size <= 25MB, duration <= 30s per request.
-"""
+"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
+import datetime
 import os
+import random

 import requests

@@ -20,6 +14,8 @@ from voice.voice import Voice


 DEFAULT_ASR_MODEL = "glm-asr-2512"
+DEFAULT_TTS_MODEL = "glm-tts"
+DEFAULT_TTS_VOICE = "tongtong"
 DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
 MAX_FILE_BYTES = 25 * 1024 * 1024
 REQUEST_TIMEOUT = (5, 60)
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)

 class ZhipuAIVoice(Voice):
    def __init__(self):
-        # api_key / base read per-call so live config edits take effect.
        pass

    def voiceToText(self, voice_file: str):
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")

    def textToVoice(self, text: str):
-        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
+        try:
+            api_key = conf().get("zhipu_ai_api_key", "")
+            if not api_key:
+                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
+
+            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
+            url = f"{api_base}/audio/speech"
+            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
+            voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
+
+            payload = {
+                "model": model,
+                "input": text,
+                "voice": voice_id,
+                "response_format": "wav",
+                "speed": 1.0,
+                "volume": 1.0,
+            }
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            }
+            response = requests.post(
+                url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
+            )
+
+            if response.status_code != 200:
+                logger.error(
+                    f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
+                    f"body={response.text[:500]} model={model} voice={voice_id}"
+                )
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            # Some errors come back as JSON / SSE with HTTP 200.
+            ct = response.headers.get("Content-Type", "")
+            if "application/json" in ct or "text/event-stream" in ct:
+                try:
+                    err = response.json()
+                except Exception:
+                    err = {"raw": response.text[:500]}
+                logger.error(
+                    f"[ZhipuAIVoice] textToVoice unexpected text response "
+                    f"(content_type={ct}): {err}"
+                )
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            audio_bytes = response.content
+            ext = self._sniff_audio_ext(audio_bytes) or "wav"
+
+            file_name = (
+                "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+                + str(random.randint(0, 1000)) + "." + ext
+            )
+            os.makedirs(os.path.dirname(file_name), exist_ok=True)
+            with open(file_name, "wb") as f:
+                f.write(audio_bytes)
+            logger.info(
+                f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
+                f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
+            )
+            return Reply(ReplyType.VOICE, file_name)
+        except Exception as e:
+            logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
+            return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+    @staticmethod
+    def _sniff_audio_ext(data: bytes) -> str:
+        """Detect the audio format from magic bytes; returns '' if unknown or if data is shorter than 12 bytes."""
+        if len(data) < 12:
+            return ""
+        head = data[:12]
+        if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
+            return "wav"
+        if head[:3] == b"ID3" or head[:2] == b"\xff\xfb" or head[:2] == b"\xff\xf3" or head[:2] == b"\xff\xf2":
+            return "mp3"
+        if head[:4] == b"OggS":
+            return "ogg"
+        if head[:4] == b"fLaC":
+            return "flac"
+        return ""

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
-        # glm-asr-2512 only accepts .wav / .mp3 — convert everything else
-        # (webm from the browser mic, m4a/amr/silk from chat channels, etc).
+        # glm-asr-2512 only accepts .wav / .mp3; anything else is converted to mp3 (the original file is used if conversion fails).
        lower = voice_file.lower()
        if lower.endswith(".mp3") or lower.endswith(".wav"):
            return voice_file
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
-            logger.warning(
-                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
-                f"submitting original file"
-            )
+            logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
            return voice_file