feat(models): support xiaomi mimo

2026-07-17 11:07:11 +08:00 · 2026-05-28 10:49:52 +08:00
parent 83cd6ad158
commit bccce2d7cb
22 changed files with 1340 additions and 3 deletions
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -66,4 +66,8 @@ def create_voice(voice_type):
        from voice.zhipuai.zhipuai_voice import ZhipuAIVoice

        return ZhipuAIVoice()
+    elif voice_type == "mimo":
+        from voice.mimo.mimo_voice import MimoVoice
+
+        return MimoVoice()
    raise RuntimeError
--- a/voice/mimo/__init__.py
+++ b/voice/mimo/__init__.py
--- a/voice/mimo/mimo_voice.py
+++ b/voice/mimo/mimo_voice.py
@@ -0,0 +1,109 @@
+# encoding:utf-8
+"""
+小米 MiMo TTS - 基于 mimo-v2.5-tts 模型的语音合成。
+
+通过 /chat/completions 接口实现：assistant 消息内容为待合成文本，
+audio 字段指定预置音色（如 冰糖/茉莉/苏打/Mia/Chloe 等），返回 base64
+编码的音频字节。
+
+文档：https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
+注意：MiMo 不提供 ASR 端点，因此 voiceToText 不实现。
+"""
+import base64
+import datetime
+import os
+import random
+
+import requests
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice.voice import Voice
+
+# Defaults used by MimoVoice.textToVoice when the matching config entry
+# (mimo_api_base / text_to_voice_model / tts_voice_id) is missing or empty.
+DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
+DEFAULT_TTS_MODEL = "mimo-v2.5-tts"
+DEFAULT_TTS_VOICE = "冰糖"  # default voice: the de facto default on the China cluster
+# Passed as the `timeout` argument of requests.post; with a 2-tuple, requests
+# treats the values as (connect timeout, read timeout) in seconds.
+REQUEST_TIMEOUT = (5, 120)
+
+
+class MimoVoice(Voice):
+    """Xiaomi MiMo voice provider (text-to-speech only).
+
+    Speech is synthesized through the ``/chat/completions`` endpoint and
+    saved as a WAV file under ``tmp/``. ``voiceToText`` is a stub that always
+    answers with an error reply.
+    """
+
+    def __init__(self):
+        pass
+
+    def voiceToText(self, voice_file: str):
+        """Report that speech recognition is unavailable.
+
+        :param voice_file: path of the audio file to transcribe (unused).
+        :return: a ``ReplyType.ERROR`` reply asking for another provider.
+        """
+        # MiMo has no standalone ASR endpoint; another provider should be
+        # configured instead (e.g. openai / zhipu / dashscope).
+        logger.warning("[MimoVoice] voiceToText is not supported by MiMo API")
+        return Reply(ReplyType.ERROR, "MiMo 暂不支持语音识别，请配置其他 voice_to_text provider")
+
+    def textToVoice(self, text: str):
+        """Synthesize ``text`` into a WAV file and return it as a voice reply.
+
+        Reads ``mimo_api_key`` (required), ``mimo_api_base``,
+        ``text_to_voice_model`` and ``tts_voice_id`` from the config; the last
+        three fall back to the module-level defaults when missing or empty.
+
+        :param text: the text to speak.
+        :return: ``Reply(ReplyType.VOICE, <file path>)`` on success, otherwise
+            a ``ReplyType.ERROR`` reply. Exceptions raised along the way are
+            logged and converted into an error reply.
+        """
+        try:
+            api_key = conf().get("mimo_api_key", "")
+            if not api_key:
+                logger.error("[MimoVoice] mimo_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 MiMo API key")
+
+            # Nothing to synthesize: skip the network round trip entirely.
+            if not text or not text.strip():
+                logger.warning("[MimoVoice] textToVoice called with empty text")
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            api_base = (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
+            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
+            voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
+
+            # The text to synthesize must be sent as the assistant message; a
+            # user message may optionally be added as a style instruction.
+            payload = {
+                "model": model,
+                "messages": [
+                    {"role": "assistant", "content": text},
+                ],
+                "audio": {
+                    "format": "wav",
+                    "voice": voice_id,
+                },
+            }
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            }
+            url = f"{api_base}/chat/completions"
+            response = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
+
+            if response.status_code != 200:
+                logger.error(
+                    f"[MimoVoice] textToVoice failed: status={response.status_code} "
+                    f"body={response.text[:500]} model={model} voice={voice_id}"
+                )
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            audio_b64 = self._extract_audio_b64(response.json())
+            if audio_b64 is None:
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            try:
+                audio_bytes = base64.b64decode(audio_b64)
+            except (ValueError, TypeError) as e:
+                # binascii.Error (malformed base64) is a ValueError subclass;
+                # TypeError covers a non-string payload.
+                logger.error(f"[MimoVoice] base64 decode failed: {e}")
+                return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+            file_name = self._save_wav(audio_bytes)
+            logger.info(
+                f"[MimoVoice] textToVoice model={model} voice={voice_id} "
+                f"file={file_name} bytes={len(audio_bytes)}"
+            )
+            return Reply(ReplyType.VOICE, file_name)
+        except Exception as e:
+            logger.exception(f"[MimoVoice] textToVoice exception: {e}")
+            return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")
+
+    @staticmethod
+    def _extract_audio_b64(data):
+        """Return the base64 audio string from a decoded response, or None.
+
+        The reason is logged when the body carries an ``error`` entry, is not
+        a JSON object, or has no ``choices[0].message.audio.data`` value.
+        """
+        if isinstance(data, dict) and "error" in data:
+            err = data["error"]
+            msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+            logger.error(f"[MimoVoice] textToVoice api error: {msg}")
+            return None
+
+        audio_b64 = None
+        if isinstance(data, dict):
+            choices = data.get("choices") or [{}]
+            message = (choices[0] or {}).get("message") or {}
+            audio_b64 = (message.get("audio") or {}).get("data")
+        if not audio_b64:
+            # Truncated so an unexpected, large body cannot flood the log.
+            logger.error(f"[MimoVoice] textToVoice empty audio in response: {str(data)[:500]}")
+            return None
+        return audio_b64
+
+    @staticmethod
+    def _save_wav(audio_bytes):
+        """Write ``audio_bytes`` to a uniquely named WAV file under ``tmp/`` and return its path."""
+        import uuid  # function-scope import; the module import block is left untouched
+
+        # A second-resolution timestamp alone can repeat when several replies
+        # are synthesized in the same second; the UUID suffix keeps concurrent
+        # requests from overwriting each other's audio.
+        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+        file_name = f"tmp/{stamp}{uuid.uuid4().hex}.wav"
+        os.makedirs(os.path.dirname(file_name), exist_ok=True)
+        with open(file_name, "wb") as f:
+            f.write(audio_bytes)
+        return file_name