feat(models): support xiaomi mimo

This commit is contained in:
zhayujie
2026-05-28 10:49:52 +08:00
parent 83cd6ad158
commit bccce2d7cb
22 changed files with 1340 additions and 3 deletions

View File

@@ -66,4 +66,8 @@ def create_voice(voice_type):
from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
return ZhipuAIVoice()
elif voice_type == "mimo":
from voice.mimo.mimo_voice import MimoVoice
return MimoVoice()
raise RuntimeError

0
voice/mimo/__init__.py Normal file
View File

109
voice/mimo/mimo_voice.py Normal file
View File

@@ -0,0 +1,109 @@
# encoding:utf-8
"""
小米 MiMo TTS - 基于 mimo-v2.5-tts 模型的语音合成。
通过 /chat/completions 接口实现assistant 消息内容为待合成文本,
audio 字段指定预置音色(如 冰糖/茉莉/苏打/Mia/Chloe 等),返回 base64
编码的音频字节。
文档https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
注意MiMo 不提供 ASR 端点,因此 voiceToText 不实现。
"""
import base64
import datetime
import os
import random
import requests
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice.voice import Voice
# Base URL of the MiMo API, used when "mimo_api_base" is not configured.
DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
# TTS model name, used when "text_to_voice_model" is not configured.
DEFAULT_TTS_MODEL = "mimo-v2.5-tts"
DEFAULT_TTS_VOICE = "冰糖" # Default voice: the de facto default on the China cluster
# Passed as `timeout` to requests.post; presumably (connect, read) seconds per
# the requests convention.
REQUEST_TIMEOUT = (5, 120)
class MimoVoice(Voice):
    """Xiaomi MiMo voice provider: text-to-speech only.

    Speech is synthesized through the ``/chat/completions`` endpoint: the text
    to speak is sent as the assistant message and the ``audio`` field of the
    request selects the output format and the voice. Speech recognition is not
    available; ``voiceToText`` always returns an error reply.
    """

    def __init__(self):
        # Stateless: every setting is read from conf() at call time.
        pass

    def voiceToText(self, voice_file: str):
        """Report that speech recognition is not supported.

        :param voice_file: audio file to transcribe; ignored.
        :return: a ``Reply`` of type ``ReplyType.ERROR``.
        """
        # MiMo has no standalone ASR endpoint; another provider
        # (e.g. openai / zhipu / dashscope) should be configured instead.
        logger.warning("[MimoVoice] voiceToText is not supported by MiMo API")
        return Reply(ReplyType.ERROR, "MiMo 暂不支持语音识别,请配置其他 voice_to_text provider")

    def textToVoice(self, text: str):
        """Synthesize ``text`` to a WAV file via the MiMo API.

        Reads ``mimo_api_key`` (required), ``mimo_api_base``,
        ``text_to_voice_model`` and ``tts_voice_id`` from ``conf()``; the last
        three fall back to the module-level defaults when unset or empty.

        :param text: the text to synthesize.
        :return: ``Reply(ReplyType.VOICE, path)`` with the path of the written
            audio file on success, otherwise a ``Reply`` of type
            ``ReplyType.ERROR``. Exceptions are caught, logged and turned into
            an error reply rather than propagated.
        """
        try:
            api_key = conf().get("mimo_api_key", "")
            if not api_key:
                logger.error("[MimoVoice] mimo_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 MiMo API key")

            # Nothing to synthesize: fail fast instead of spending a request.
            if not text or not text.strip():
                logger.warning("[MimoVoice] textToVoice called with empty text")
                return Reply(ReplyType.ERROR, "待合成的文本为空")

            api_base = (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
            voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE

            # The text to synthesize must go in the assistant message; a user
            # message is optional and can carry style instructions.
            payload = {
                "model": model,
                "messages": [
                    {"role": "assistant", "content": text},
                ],
                "audio": {
                    "format": "wav",
                    "voice": voice_id,
                },
            }
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            }
            url = f"{api_base}/chat/completions"

            response = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                # Only the head of the body is logged to keep log lines bounded.
                logger.error(
                    f"[MimoVoice] textToVoice failed: status={response.status_code} "
                    f"body={response.text[:500]} model={model} voice={voice_id}"
                )
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            data = response.json()
            # An error payload can arrive even with HTTP 200.
            if "error" in data:
                err = data["error"]
                msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
                logger.error(f"[MimoVoice] textToVoice api error: {msg}")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            audio_b64 = self._extract_audio_b64(data)
            if not audio_b64:
                logger.error(f"[MimoVoice] textToVoice empty audio in response: {data}")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            try:
                audio_bytes = base64.b64decode(audio_b64)
            except Exception as e:
                logger.error(f"[MimoVoice] base64 decode failed: {e}")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            file_name = self._save_audio(audio_bytes)
            logger.info(
                f"[MimoVoice] textToVoice model={model} voice={voice_id} "
                f"file={file_name} bytes={len(audio_bytes)}"
            )
            return Reply(ReplyType.VOICE, file_name)
        except Exception as e:
            # Boundary handler: network errors, invalid JSON, unexpected
            # response shapes and file-system errors all end up here.
            logger.exception(f"[MimoVoice] textToVoice exception: {e}")
            return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

    @staticmethod
    def _extract_audio_b64(data):
        """Return ``choices[0].message.audio.data`` from a response, or None."""
        # Each level tolerates a missing or null value by falling back to {}.
        message = (data.get("choices") or [{}])[0].get("message", {}) or {}
        audio_obj = message.get("audio") or {}
        return audio_obj.get("data")

    @staticmethod
    def _save_audio(audio_bytes):
        """Write ``audio_bytes`` to a new ``.wav`` file under ``tmp/``; return its path."""
        # The timestamp only has one-second resolution, so a 64-bit random
        # token is appended to keep concurrent syntheses from picking the same
        # name and overwriting each other's audio.
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        file_name = "tmp/" + stamp + "_" + format(random.getrandbits(64), "016x") + ".wav"
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "wb") as f:
            f.write(audio_bytes)
        return file_name