feat(voice): rework TTS/ASR stack and unify tool/skill config schema

2026-07-20 05:27:59 +08:00 · 2026-05-21 16:00:54 +08:00
parent 2b90f377e6
commit b8333e351c
31 changed files with 1551 additions and 335 deletions
--- a/voice/zhipuai/zhipuai_voice.py
+++ b/voice/zhipuai/zhipuai_voice.py
@@ -1,14 +1,8 @@
 # encoding:utf-8
-"""
-ZhipuAI (BigModel) voice service.
-
-ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
-TTS : not yet implemented.
-
-Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
-File size <= 25MB, duration <= 30s per request.
-"""
+"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
+import datetime
 import os
+import random

 import requests

@@ -20,6 +14,8 @@ from voice.voice import Voice


 DEFAULT_ASR_MODEL = "glm-asr-2512"
+DEFAULT_TTS_MODEL = "glm-tts"
+DEFAULT_TTS_VOICE = "tongtong"
 DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
 MAX_FILE_BYTES = 25 * 1024 * 1024
 REQUEST_TIMEOUT = (5, 60)
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)

 class ZhipuAIVoice(Voice):
    def __init__(self):
-        # api_key / base read per-call so live config edits take effect.
        pass

    def voiceToText(self, voice_file: str):
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音，请稍后再试吧~")

    def textToVoice(self, text: str):
-        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
+        try:
+            api_key = conf().get("zhipu_ai_api_key", "")
+            if not api_key:
+                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
+
+            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
+            url = f"{api_base}/audio/speech"
+            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
+            voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
+
+            payload = {
+                "model": model,
+                "input": text,
+                "voice": voice_id,
+                "response_format": "wav",
+                "speed": 1.0,
+                "volume": 1.0,
+            }
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            }
+            response = requests.post(
+                url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
+            )
+
+            if response.status_code != 200:
+                logger.error(
+                    f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
+                    f"body={response.text[:500]} model={model} voice={voice_id}"
+                )
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            # Some errors come back as JSON / SSE with HTTP 200.
+            ct = response.headers.get("Content-Type", "")
+            if "application/json" in ct or "text/event-stream" in ct:
+                try:
+                    err = response.json()
+                except Exception:
+                    err = {"raw": response.text[:500]}
+                logger.error(
+                    f"[ZhipuAIVoice] textToVoice unexpected text response "
+                    f"(content_type={ct}): {err}"
+                )
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            audio_bytes = response.content
+            ext = self._sniff_audio_ext(audio_bytes) or "wav"
+
+            file_name = (
+                "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+                + str(random.randint(0, 1000)) + "." + ext
+            )
+            os.makedirs(os.path.dirname(file_name), exist_ok=True)
+            with open(file_name, "wb") as f:
+                f.write(audio_bytes)
+            logger.info(
+                f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
+                f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
+            )
+            return Reply(ReplyType.VOICE, file_name)
+        except Exception as e:
+            logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
+            return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+    @staticmethod
+    def _sniff_audio_ext(data: bytes) -> str:
+        """Detect the audio container ('wav'/'mp3'/'ogg'/'flac') by magic bytes; returns '' on unknown."""
+        if len(data) < 12:  # shorter than the 12-byte RIFF....WAVE signature checked below
+            return ""
+        if data[:4] == b"RIFF" and data[8:12] == b"WAVE":
+            return "wav"
+        # MP3 = ID3v2 tag, or a frame header: 0xFF + 3 sync bits, non-reserved version, Layer III.
+        # The mask covers MPEG-1/2/2.5 with or without CRC (FF FA/FB, FF F2/F3, FF E2/E3).
+        if data[:3] == b"ID3" or (data[0] == 0xFF and (data[1] & 0xE6) == 0xE2 and (data[1] & 0x18) != 0x08):
+            return "mp3"
+        for magic, ext in ((b"OggS", "ogg"), (b"fLaC", "flac")):
+            if data[:4] == magic:
+                return ext
+        return ""

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
-        # glm-asr-2512 only accepts .wav / .mp3 — convert everything else
-        # (webm from the browser mic, m4a/amr/silk from chat channels, etc).
+        # glm-asr-2512 only accepts .wav / .mp3; any other format is converted to mp3 first.
        lower = voice_file.lower()
        if lower.endswith(".mp3") or lower.endswith(".wav"):
            return voice_file
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
-            logger.warning(
-                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
-                f"submitting original file"
-            )
+            logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
            return voice_file